html - Lexbor : webscraping an HTML table in C

标签 html c web-scraping

目标:

使用 lexbor 查找并打印 HTML 表格中某个 <td> 标签之间的值。Lexbor 的详细信息和源代码可以在这里找到。

更多详细信息:

有很多 <td> 标签,每个标签都由唯一的 header 属性标识。下面是一个简单的例子,其中只对第一列的值 0.7 感兴趣(即带有 header="choose-this-header" 的标签)。

<table>
<tbody>
<tr>
    <td header="choose-this-header">0.7</td>
    <td header="ignore-this-header">1.3</td>
    <td header="ignore-this-header">5.4</td>
</tr>
</tbody>
</table>

因此,找到该值的最佳方法似乎是:

  1. 在 HTML 中搜索带有 header="choose-this-header" 的元素
  2. 隔离 HTML 的这一行,并提取 <td>...</td> 标签之间的值

问题:

基于这个 lexbor 示例编写的 step_one.c(如下所示)成功检测到了包含所需 header 的 HTML 行,但它在终端上只打印出 <td header="choose-this-header">,没有文本值,也没有结束标签 </td>。如果有办法将整行(即 <td header="choose-this-header">0.7</td>)保存到缓冲区中,那么基于另一个示例编写的程序 step_two.c(见下文)就可以用来提取文本值 0.7。

step_one.c

#include "base.h"

#include <lexbor/dom/dom.h>


/*
 * Pass every element stored in `collection` to serialize_node(), in index
 * order, and then clean the collection with lxb_dom_collection_clean().
 *
 * serialize_node() is not defined in this file (presumably it comes from
 * base.h); according to the program output shown for this example it prints
 * only the element's opening tag, not its children.
 */
static void
print_collection_elements(lxb_dom_collection_t *collection)
{
    size_t idx = 0;

    /* The length is re-read on every pass, exactly as a for-loop would. */
    while (idx < lxb_dom_collection_length(collection)) {
        lxb_dom_node_t *node;

        node = lxb_dom_interface_node(lxb_dom_collection_element(collection,
                                                                 idx));
        serialize_node(node);

        idx++;
    }

    lxb_dom_collection_clean(collection);
}

/*
 * step_one.c entry point.
 *
 * Parses a small hard-coded HTML table, collects every element under <body>
 * whose "header" attribute equals "choose-this-header", and prints each match
 * through print_collection_elements().
 *
 * parse(), PRINT() and FAILED() are not defined in this file; they are
 * expected to be provided by base.h. FAILED() is assumed not to return
 * (TODO confirm against base.h).
 *
 * Returns 0 on success.
 */
int
main(int argc, const char *argv[])
{
    lxb_status_t status;
    lxb_dom_element_t *body;
    lxb_html_document_t *document;
    lxb_dom_collection_t *collection;

    const lxb_char_t html[] = "<table>"
            "<tbody>"
            "<tr>"
            "<td header=\"choose-this-header\">0.7</td>"
            "<td header=\"ignore-this-header\">1.3</td>"
            "<td header=\"ignore-this-header\">5.4</td>"
            "</tr>"
            "</tbody>"
            "</table>";

    /* Length without the terminating NUL. */
    size_t html_size = sizeof(html) - 1;

    /*
     * Attribute to look up. Keeping name and value in arrays lets the
     * compiler compute their lengths instead of hand-counted constants.
     */
    static const lxb_char_t attr_name[] = "header";
    static const lxb_char_t attr_value[] = "choose-this-header";

    /* Command-line arguments are not used by this example. */
    (void) argc;
    (void) argv;

    PRINT("HTML:");
    PRINT("%s", (const char *) html);

    document = parse(html, html_size);

    body = lxb_dom_interface_element(document->body);

    collection = lxb_dom_collection_make(&document->dom_document, 128);
    if (collection == NULL) {
        FAILED("Failed to create Collection object");
    }

    /* Full match on the attribute value; the last argument is passed as true, as before. */
    status = lxb_dom_elements_by_attr(body, collection,
                                      attr_name, sizeof(attr_name) - 1,
                                      attr_value, sizeof(attr_value) - 1,
                                      true);
    if (status != LXB_STATUS_OK) {
        FAILED("Failed to get elements by attribute");
    }

    PRINT("\nFull match by 'choose-this-header':");
    print_collection_elements(collection);

    lxb_dom_collection_destroy(collection, true);
    lxb_html_document_destroy(document);

    return 0;
}

step_one.c 输出:

HTML:
<table><tbody><tr><td header="choose-this-header">0.7</td><td header="ignore-this-header">1.3</td><td header="ignore-this-header">5.4</td></tr></tbody></table>

Full match by 'choose-this-header':
<td header="choose-this-header"> // no text value or closing tag is printed

step_two.c

#include "lexbor/html/tokenizer.h"


/*
 * Report a fatal error and terminate the program.
 *
 * The arguments form a printf-style format string plus its values. The
 * message is written to stderr followed by a newline, then the process exits
 * with EXIT_FAILURE. The do { } while (0) wrapper makes an invocation behave
 * as a single statement, so it is safe in an unbraced if/else.
 */
#define FAILED(...)                                                            \
    do {                                                                       \
        fprintf(stderr, __VA_ARGS__);                                          \
        fputc('\n', stderr);                                                   \
        exit(EXIT_FAILURE);                                                    \
    } while (0)


/*
 * Token callback: prints the bytes of #text tokens to stdout.
 *
 * Tokens whose tag_id is not LXB_TAG__TEXT are passed through without any
 * output. The incoming token is returned in every case. The tkz and ctx
 * parameters are not used.
 */
static lxb_html_token_t *
token_callback(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx)
{
    if (token->tag_id == LXB_TAG__TEXT) {
        /* Print with an explicit length taken from text_start..text_end. */
        int text_len = (int) (token->text_end - token->text_start);

        printf("%.*s", text_len, token->text_start);
    }

    return token;
}

/*
 * step_two.c entry point.
 *
 * Echoes a hard-coded <td> fragment, then feeds it through the lexbor HTML
 * tokenizer with token_callback installed, which prints the text tokens.
 * Any failing tokenizer step aborts through FAILED().
 *
 * Returns 0 on success.
 */
int
main(int argc, const char *argv[])
{
    const lxb_char_t data[] = "<td headers=\"choose-this-header\">0.7</td>";
    lxb_html_tokenizer_t *tokenizer;
    lxb_status_t rc;

    printf("HTML:\n%s\n\n", (char *) data);
    puts("Result:");

    /* The result of create() is handed straight to init(), as in the original. */
    tokenizer = lxb_html_tokenizer_create();
    rc = lxb_html_tokenizer_init(tokenizer);
    if (rc != LXB_STATUS_OK) {
        FAILED("Failed to create tokenizer object");
    }

    /* Register token_callback for finished tokens; no user context is passed. */
    lxb_html_tokenizer_callback_token_done_set(tokenizer, token_callback, NULL);

    rc = lxb_html_tokenizer_begin(tokenizer);
    if (rc != LXB_STATUS_OK) {
        FAILED("Failed to prepare tokenizer object for parsing");
    }

    /* Feed the whole fragment as one chunk, excluding the terminating NUL. */
    rc = lxb_html_tokenizer_chunk(tokenizer, data, sizeof data - 1);
    if (rc != LXB_STATUS_OK) {
        FAILED("Failed to parse the html data");
    }

    rc = lxb_html_tokenizer_end(tokenizer);
    if (rc != LXB_STATUS_OK) {
        FAILED("Failed to ending of parsing the html data");
    }

    putchar('\n');

    lxb_html_tokenizer_destroy(tokenizer);

    return 0;
}

step_two.c 输出:

HTML:
<td headers="choose-this-header">0.7</td>

Result:
0.7

其他详细信息:

  1. 最好继续使用 lexbor因为它很快
  2. 使用Ubuntu 20.04.1 LTS
  3. 使用 gcc myprogram.c -llexbor -o myprogram 进行编译
  4. 各种操作系统的安装说明可以在这里找到

摘要问题:

Q1. 如何修改程序 step_one.c,以便将整行保存到缓冲区中?一旦实现了这一点,将两个程序合并为一个就相对简单了:step_two.c 中的变量 data[] 将是使用 step_one.c 中的逻辑找到的整行。

最佳答案

这是一个例子:

#include <lexbor/html/html.h>
#include <lexbor/css/css.h>
#include <lexbor/selectors/selectors.h>


/*
 * Output callback handed to the lxb_html_serialize_*_cb() functions.
 *
 * Prints up to `len` bytes of `data` to stdout. The ctx parameter is not
 * used. Always returns LXB_STATUS_OK.
 */
lxb_status_t
callback(const lxb_char_t *data, size_t len, void *ctx)
{
    const char *chunk = (const char *) data;
    int chunk_len = (int) len;

    printf("%.*s", chunk_len, chunk);

    return LXB_STATUS_OK;
}

/*
 * Callback passed to lxb_selectors_find().
 *
 * Prints three labelled serializations of `node`, all routed through
 * callback(): the node alone, the node together with its children, and the
 * children only. The serialization statuses are deliberately discarded and
 * LXB_STATUS_OK is always returned. The spec and ctx parameters are not used.
 *
 * To capture the markup in a buffer instead of printing it, use the
 * lxb_html_serialize_*_str(...) variants.
 */
lxb_status_t
find_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec,
              void *ctx)
{
    /* 1. The node by itself: only the <td ...> tag. */
    fputs("Tag:\n", stdout);
    (void) lxb_html_serialize_cb(node, callback, NULL);

    /* 2. The <td> element and everything inside it. */
    fputs("\n\nTag with children:\n", stdout);
    (void) lxb_html_serialize_tree_cb(node, callback, NULL);

    /* 3. Only what is inside the <td>. */
    fputs("\n\nChildren:\n", stdout);
    (void) lxb_html_serialize_deep_cb(node, callback, NULL);

    return LXB_STATUS_OK;
}

/*
 * Entry point of the answer's example.
 *
 * Parses a hard-coded HTML table, compiles the CSS selector
 * td[header='choose-this-header'], and runs it against <body>;
 * find_callback() is invoked by lxb_selectors_find() and prints the result.
 *
 * Every failure jumps to a single cleanup path, so objects created before the
 * failing step are destroyed instead of leaked.
 *
 * Returns EXIT_SUCCESS when all steps succeed, EXIT_FAILURE otherwise.
 */
int main(void) {
    int rc = EXIT_FAILURE;
    lxb_status_t status;
    lxb_dom_node_t *body;
    lxb_html_document_t *document = NULL;
    lxb_css_parser_t *parser = NULL;
    lxb_selectors_t *selectors = NULL;
    lxb_css_selector_list_t *list = NULL;

    const lxb_char_t html[] = "<table>"
            "<tbody>"
            "<tr>"
            "<td header=\"choose-this-header\">0.7</td>"
            "<td header=\"ignore-this-header\">1.3</td>"
            "<td header=\"ignore-this-header\">5.4</td>"
            "</tr>"
            "</tbody>"
            "</table>";

    static const lxb_char_t slctrs[] = "td[header='choose-this-header']";

    document = lxb_html_document_create();
    if (document == NULL) {
        goto done;
    }

    status = lxb_html_document_parse(document, html, sizeof(html) - 1);
    if (status != LXB_STATUS_OK) {
        goto done;
    }

    /* Create CSS parser. */

    parser = lxb_css_parser_create();
    status = lxb_css_parser_init(parser, NULL, NULL);
    if (status != LXB_STATUS_OK) {
        goto done;
    }

    /* Selectors. */

    selectors = lxb_selectors_create();
    status = lxb_selectors_init(selectors);
    if (status != LXB_STATUS_OK) {
        goto done;
    }

    /* Also reject a NULL list so it is never handed to lxb_selectors_find(). */
    list = lxb_css_selectors_parse(parser, slctrs, sizeof(slctrs) - 1);
    if (list == NULL || parser->status != LXB_STATUS_OK) {
        goto done;
    }

    /* Find DOM/HTML nodes by selectors. */

    body = lxb_dom_interface_node(lxb_html_document_body_element(document));
    if (body == NULL) {
        goto done;
    }

    status = lxb_selectors_find(selectors, body, list, find_callback, NULL);
    if (status != LXB_STATUS_OK) {
        goto done;
    }

    printf("\n");

    rc = EXIT_SUCCESS;

done:
    /*
     * Teardown in the same order as the original success path. Each pointer
     * is checked because a failure may have happened before it was created.
     */

    /* Destroy Selectors object. */
    if (selectors != NULL) {
        (void) lxb_selectors_destroy(selectors, true);
    }

    /* Destroy resources for CSS Parser. */
    if (parser != NULL) {
        (void) lxb_css_parser_destroy(parser, true);
    }

    /* Destroy all Selector List memory. */
    if (list != NULL) {
        lxb_css_selector_list_destroy_memory(list);
    }

    /* Destroy HTML Document. */
    if (document != NULL) {
        lxb_html_document_destroy(document);
    }

    return rc;
}

输出:

Tag:
<td header="choose-this-header">

Tag with children:
<td header="choose-this-header">0.7</td>

Children:
0.7

关于 html - Lexbor : webscraping an HTML table in C,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/69827153/

相关文章:

html - 液体 div 图像叠加

javascript 获取日期时间输入并根据输入过滤列表

html - 如何根据字母表的长度更改 css 选项卡的宽度?

GCC 中的 C 函数对齐

c - 从函数指针中读取数据是否合法?

excel - VBA 使用 JavaScript 元素从 URL 中抓取 HTML

python - 如何从无限滚动网页中抓取正确数量的 URL?

python - 将 Scrapy 请求 URL 添加到 Parsed Array 中

javascript - 在 jQuery 中悬停时处理 CSS 更改(用户选择的颜色)

c - pthread_create空间不足