c - 与 Safari Web Inspector 中使用 libxml2 类似的 DOM 树

标签 c html-parsing libxml2

如何使用 libxml2 获取某个特定 DOM 对象的 DOM 树,就像 Safari 的 Web 检查器中显示的那样?

Safari Screenshot

最佳答案

使用下面的示例代码,您可以在命令行中指定一个标签(TAG);如果该标签存在于 HTML 中,程序就会把它转储出来(这里我使用 stackoverflow 页面的 head 标签作为示例数据;在您自己的代码中,可能需要使用 libcurl 来获取 HTML 缓冲区):

/* Compile like this :
 * gcc -Wall html_dom_dump.c -o html_dom_dump `xml2-config --cflags` `xml2-config --libs` 
 */
#include <stdio.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <stdlib.h>

/*
 * Sample HTML document (a <head> section) that main() feeds to the libxml2
 * HTML push parser. It is a single string literal continued across source
 * lines with backslash-newline.
 *
 * NOTE(review): the literal contains a closing </script> tag but no opening
 * <script> tag; the opening tag was presumably lost when this snippet was
 * extracted -- confirm against the original answer.
 */
char stackoverflow_html_head[] = "<head>\
    <title>Stack Overflow</title>\
    <link rel=\"shortcut icon\" href=\"http://cdn.sstatic.net/stackoverflow/img/favicon.ico\">\
    <link rel=\"apple-touch-icon\" href=\"http://cdn.sstatic.net/stackoverflow/img/apple-touch-icon.png\">\
    <link rel=\"search\" type=\"application/opensearchdescription+xml\" title=\"Stack Overflow\" href=\"/opensearch.xml\">\
\
\
        StackExchange.init({\"stackAuthUrl\":\"https://stackauth.com\",\"serverTime\":1345183802,\"styleCode\":true,\"enableUserHovercards\":true,\"site\":{\"name\":\"Stack Overflow\",\"description\":\"Q\\u0026A for professional and enthusiast programmers\",\"isNoticesTabEnabled\":true,\"newTitleSearchBoxEnabled\":false,\"enableSocialMediaInSharePopup\":true},\"user\":{\"isAnonymous\":true,\"fkey\":\"52eb3bfedea6eccd9936d40e8ca0c8de\",\"notificationsUnviewedCount\":0,\"inboxUnviewedCount\":-1}});        StackExchange.using.setCacheBreakers({\"js/prettify-full.js\":\"d1cd9a23171c\",\"js/moderator.js\":\"8c49fc268737\",\"js/full-anon.js\":\"945170d238e3\",\"js/full.js\":\"c60de8021771\",\"js/wmd.js\":\"93b92575f8bc\",\"js/third-party/jquery.autocomplete.min.js\":\"e5f01e97f7c3\",\"js/mobile.js\":\"6eb68240242f\",\"js/help.js\":\"fc9fb0517db2\",\"js/tageditor.js\":\"c1ba807b32aa\",\"js/tageditornew.js\":\"bd66fabe1c71\",\"js/inline-tag-editing.js\":\"be882e188985\",\"js/revisions.js\":\"8c6bcd93b7fe\",\"js/suggested-edits.js\":\"46c4696efca5\",\"js/probes.js\":\"beb933322ff0\",\"js/review.js\":\"fca067ef962b\"});\
    </script>\
\
</head>";

/* Running count of matching nodes; incremented and returned by walk_tree(). */
int found = 0;

/*
 * Recursively walk a libxml2 node list and dump every element whose tag
 * name equals `pattern`.
 *
 * node:    first node of the sibling list to scan; NULL is accepted and
 *          simply matches nothing.
 * doc:     document the nodes belong to; passed through to xmlElemDump().
 * pattern: tag name to look for (compared with xmlStrcmp, so case matters).
 *
 * Each match is written to stdout between "----> WE GOT IT" and "<----"
 * markers and counted in the file-scope `found` counter.
 *
 * Returns the current value of `found`, i.e. the number of matches
 * accumulated so far (the counter is never reset by this function).
 */
int walk_tree(xmlNode *node, xmlDocPtr doc, char *pattern)
{
        xmlNode *cur_node = NULL;

        for (cur_node = node; cur_node; cur_node = cur_node->next)
        {
                /* Only element nodes are tags. libxml2 names text and comment
                 * nodes "text" / "comment", so without the type check such a
                 * pattern would wrongly report those nodes as matching tags. */
                if (cur_node->type == XML_ELEMENT_NODE &&
                    !xmlStrcmp(cur_node->name, (const xmlChar *)pattern))
                {
                        found++;
                        fprintf(stdout, "\n----> WE GOT IT\n\n");
                        xmlElemDump(stdout, doc, cur_node);
                        fprintf(stdout, "\n<----\n");
                }
                /* Descend into the subtree; a NULL child list ends the recursion. */
                walk_tree(cur_node->children, doc, pattern);
        }

        return found;
}

int main(int argc, char **argv)
{
        int ret;
        /* Create a parser context*/
        htmlParserCtxtPtr html_parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, 0);

        if (argc != 2)
        {
                fprintf(stderr, "Usage : ./html_dom_dump TAG");

                exit(EXIT_FAILURE);
        }

        /* remove blank nodes
         * suppress error reports
         * suppress warning reports
         * Forbid network access
         * more on this options: http://xmlsoft.org/html/libxml-HTMLparser.html#htmlParserOption
         */
        htmlCtxtUseOptions(html_parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
        /* parsing our stackoverflow html header */
        htmlParseChunk(html_parser, stackoverflow_html_head, sizeof(stackoverflow_html_head), 0);
        /* Traverse all the tree to find the given TAG (pattern) */
        ret = walk_tree(xmlDocGetRootElement(html_parser->myDoc), html_parser->myDoc, argv[1]);
        if (!ret)
                fprintf(stdout, "No luck, this tag does not exit!\n");

        return 0;
}

使用 libxml2 进行编译和链接:

gcc -Wall html_dom_dump.c -o html_dom_dump `xml2-config --cflags` `xml2-config --libs`

你可以像这样运行它:

toc@UnixServer:~$ ./html_dom_dump head

----> WE GOT IT

<head>
<title>Stack Overflow</title>
<link rel="shortcut icon" href="http://cdn.sstatic.net/stackoverflow/img/favicon.ico">
<link rel="apple-touch-icon" href="http://cdn.sstatic.net/stackoverflow/img/apple-touch-icon.png">
<link rel="search" type="application/opensearchdescription+xml" title="Stack Overflow" href="/opensearch.xml">
</head>

<----
toc@UnixServer:~$ ./html_dom_dump link

----> WE GOT IT

<link rel="shortcut icon" href="http://cdn.sstatic.net/stackoverflow/img/favicon.ico">

<----

----> WE GOT IT

<link rel="apple-touch-icon" href="http://cdn.sstatic.net/stackoverflow/img/apple-touch-icon.png">

<----

----> WE GOT IT

<link rel="search" type="application/opensearchdescription+xml" title="Stack Overflow" href="/opensearch.xml">
<----
toc@UnixServer:~$ ./html_dom_dump TAG
No luck, this tag does not exit!

如果您不知道,您还可以使用 libcurl + LibTidy 来获取并解析您的 HTML: http://curl.haxx.se/libcurl/c/htmltidy.html

关于c - 与 Safari Web Inspector 中使用 libxml2 类似的 DOM 树,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/11985108/

相关文章:

c - 如何使用 C 中的 libxml 删除 XML 中元素/节点之间的空格?

c - 如何在c中打印 float 的地址

c - valgrind 报告奇怪的内存使用情况

php - 从解析的 HTML 表构建数组

java - 有没有好的方法可以解析 HTML 简历

读取丑字后继续解析

java - 按位非运算符

c - 算术表达式也无法在 yacc 语法中正确解析

python - 使用 Beautiful Soup + Requests 时 find_all() 未找到任何结果

c - xmlNodeGetContent 引入换行符