html - 在 html_table(rvest) 中指定列类

标签 html r html-table rvest

我正在使用 rvest 的 html_table 从下面的网站读取一个包含两列的索引表。两列中都有带前导零的值,我希望保留这些前导零。因此,我希望这两列的类(class)都是 character(字符型)。我使用以下代码:

library(rvest)
library(data.table)

# Collect one data frame per result page (pages 1 to 25) in a list.
df <- list() 
for (j in 1:25) {
     # Build the URL of page j; the page number is spliced into the `p` query parameter.
     url <- paste('http://unstats.un.org/unsd/cr/registry/regso.asp?Ci=70&Lg=1&Co=&T=0&p=',
           j, '&prn=yes', sep='') 
     webpage <- read_html(url) 
     # Select all <table> nodes on the page, parse them, and keep the first parsed table.
     table <- html_nodes(webpage, 'table') 
     df[[j]] <- html_table(table, header=TRUE)[[1]] 
     # Keep only the first two columns of that table.
     df[[j]] <- df[[j]][,c(1:2) ] 
}
# Stack the per-page data frames into one table with data.table's rbindlist().
ISIC4.NACE2 <- rbindlist(df) 

但是 str(df[[1]]) 返回

'data.frame':   40 obs. of  2 variables:
$ ISIC Rev.4: chr  "01" "011" "0111" "0112" ...
$ NACE Rev.2: num  1 1.1 1.11 1.12 1.13 1.14 1.15 1.16 1.19 1.2 ...

似乎 html_table 函数将第一列解释为字符,将第二列解释为数字,从而截断了后者中的前导零。有没有办法使用 html_table 指定列类?

最佳答案

col_classes 应该是 NULL 或者一个 list。如果是 list,那么它应该是这样的形式:

list(`COL#`=`class`, ...)

例如:

list(`1`='character', `3`='integer', `7`='logical')

您必须在使用 rvest 的 R 会话(session)中 source(运行)本文末尾分隔线之后的全部代码,因为这些代码会替换 rvest 中这些函数的 S3 定义。

我将您代码中的 html_table 行更改为:

# Ask for column 2 to be coerced to character instead of being type-guessed.
df[[j]] <- html_table(table, header=TRUE, col_classes=list(`2`='character'))[[1]] 

现在将以下内容作为 str 输出:

'data.frame':   40 obs. of  2 variables:
 $ ISIC Rev.4: int  1 11 111 112 113 114 115 116 119 12 ...
 $ NACE Rev.2: chr  "01" "01.1" "01.11" "01.12" ...

------ 请 source(运行)以下所有内容 ------

#' Parse HTML tables into data frames (S3 generic)
#'
#' Dispatches on the class of `x`; all remaining arguments are handed to the
#' selected method unchanged.
#'
#' @param x Object to extract the table(s) from.
#' @param header,trim,fill,dec Parsing options interpreted by the methods.
#' @param col_classes `NULL`, or a list named by column position giving the
#'   class each listed column should be coerced to.
html_table <- function(x,
                       header = NA,
                       trim = TRUE,
                       fill = FALSE,
                       dec = ".",
                       col_classes = NULL) {
  UseMethod("html_table")
}

#' Parse every table in an HTML document
#'
#' Finds all `<table>` elements below `x` and parses each one with
#' `html_table()`, forwarding the parsing options unchanged.
#'
#' @param x A document to search for `<table>` elements.
#' @param header,trim,fill,dec,col_classes Forwarded to `html_table()` for
#'   each table found.
#' @return A list with one parsed result per `<table>` element.
#' @export
html_table.xml_document <- function(x, header = NA, trim = TRUE, fill = FALSE,
                                    dec = ".", col_classes = NULL) {
  tables <- xml2::xml_find_all(x, ".//table")
  # Forward col_classes by name so it cannot be mismatched positionally.
  lapply(tables, html_table, header = header, trim = trim, fill = fill,
         dec = dec, col_classes = col_classes)
}

#' Parse each node of a node set as a table
#'
#' Applies `html_table()` to every element of `x`, passing the parsing
#' options through unchanged.
#'
#' @param x A collection of nodes; each element is parsed separately.
#' @param header,trim,fill,dec,col_classes Forwarded to `html_table()` for
#'   each element.
#' @return A list with one parsed result per element of `x`.
html_table.xml_nodeset <- function(x, header = NA, trim = TRUE, fill = FALSE,
                                  dec = ".", col_classes = NULL) {
  # FIXME: guess useful names
  parse_one <- function(node) {
    html_table(
      node,
      header = header,
      trim = trim,
      fill = fill,
      dec = dec,
      col_classes = col_classes
    )
  }
  lapply(x, parse_one)
}


#' Parse one `<table>` node into a data frame
#'
#' Cells spanning several columns or rows are expanded by repeating their
#' text, the first row is optionally taken as the header, and each column is
#' then converted either to the class requested in `col_classes` or to the
#' type guessed by `utils::type.convert()`.
#'
#' @param x A node whose `html_name()` is `"table"`.
#' @param header Use the first row as column names? If `NA`, the first row is
#'   used when all of its cells are `<th>`.
#' @param trim Passed to `html_text()` when extracting the cell text.
#' @param fill If `FALSE`, a table whose rows have differing column counts
#'   that its colspan/rowspan totals cannot account for raises an error.
#'   If `TRUE`, parsing continues and unfilled cells stay `NA`.
#' @param dec Decimal separator passed to `utils::type.convert()`.
#' @param col_classes `NULL`, or a list named by column position (as a
#'   string) giving the class to coerce that column to with `as()`, for
#'   example `list("2" = "character")`. Columns not listed are type-guessed.
#' @return A data frame with one column per table column.
html_table.xml_node <- function(x, header = NA, trim = TRUE,
                                fill = FALSE, dec = ".",
                                col_classes = NULL) {

  stopifnot(html_name(x) == "table")

  # Collect the cells of every row together with their colspan/rowspan
  # attributes (defaulting to 1); the spans are expanded further down.
  rows <- html_nodes(x, "tr")
  n <- length(rows)
  cells <- lapply(rows, "html_nodes", xpath = ".//td|.//th")

  ncols <- lapply(cells, html_attr, "colspan", default = "1")
  ncols <- lapply(ncols, as.integer)
  nrows <- lapply(cells, html_attr, "rowspan", default = "1")
  nrows <- lapply(nrows, as.integer)

  # Distinct per-row column counts (colspans included) and the widest row.
  p <- unique(vapply(ncols, sum, integer(1)))
  maxp <- max(p)

  # All operands are scalars, so use the short-circuit `&&`.
  if (length(p) > 1 && maxp * n != sum(unlist(nrows)) &&
      maxp * n != sum(unlist(ncols))) {
    # then malformed table is not parsable by smart filling solution
    if (!fill) { # fill must then be specified to allow filling with NAs
      stop("Table has inconsistent number of columns. ",
           "Do you want fill = TRUE?", call. = FALSE)
    }
  }

  values <- lapply(cells, html_text, trim = trim)
  out <- matrix(NA_character_, nrow = n, ncol = maxp)

  # fill colspans right with repetition
  for (i in seq_len(n)) {
    row <- values[[i]]
    ncol <- ncols[[i]]
    col <- 1
    for (j in seq_along(ncol)) {
      out[i, col:(col+ncol[j]-1)] <- row[[j]]
      col <- col + ncol[j]
    }
  }

  # fill rowspans down with repetition
  for (i in seq_len(maxp)) {
    for (j in seq_len(n)) {
      # Both are NA when row j has fewer than i cells.
      rowspan <- nrows[[j]][i]; colspan <- ncols[[j]][i]
      if (!is.na(rowspan) && (rowspan > 1)) {
        if (!is.na(colspan) && (colspan > 1)) {
          # special case of colspan and rowspan in same cell
          # `rowspan` is a single value, so the tail() count below is -i:
          # a negative count drops the first i entries, which inserts
          # colspan - 1 copies of the rowspan right after position i.
          nrows[[j]] <- c(head(nrows[[j]], i),
                          rep(rowspan, colspan-1),
                          tail(nrows[[j]], length(rowspan)-(i+1)))
          rowspan <- nrows[[j]][i]
        }
        # Push the spanning cell's text into column i of the rows below,
        # shifting their existing content right and truncating to maxp.
        for (k in seq_len(rowspan - 1)) {
          l <- head(out[j+k, ], i-1)
          r <- tail(out[j+k, ], maxp-i+1)
          out[j + k, ] <- head(c(l, out[j, i], r), maxp)
        }
      }
    }
  }

  if (is.na(header)) {
    header <- all(html_name(cells[[1]]) == "th")
  }
  if (header) {
    col_names <- out[1, , drop = FALSE]
    out <- out[-1, , drop = FALSE]
  } else {
    col_names <- paste0("X", seq_len(ncol(out)))
  }

  # Convert matrix to list to data frame
  df <- lapply(seq_len(maxp), function(i) {
    if (!is.null(col_classes) && (i %in% names(col_classes))) {
      # Explicitly requested class: coerce the raw cell text, no guessing.
      as(out[, i], col_classes[[as.character(i)]])
    } else {
      utils::type.convert(out[, i], as.is = TRUE, dec = dec)
    }
  })
  names(df) <- col_names
  class(df) <- "data.frame"
  attr(df, "row.names") <- .set_row_names(length(df[[1]]))

  if (length(unique(col_names)) < length(col_names)) {
    warning('At least two columns have the same name')
  }

  df
}

关于html - 在 html_table(rvest) 中指定列类,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/37293830/

相关文章:

html - 按钮和输入字段没有排成一排

r - 将 tibble 转换为带有列标题的数据框

javascript - jQuery 将动态添加的表行值复制到下一行

html - 表格宽度占页面的 100%,我需要表格与其他元素在同一行

javascript - 如何使用javascript更改表中的元素

jquery - 在多个 DIV 中使用 jquery 显示/隐藏内容

jquery - 输入无效值时向输入字段添加图标

r - 朴素贝叶斯 e1071 将每个姓氏归类为相同的祖先

r - R中跨多个列的简洁子集

jquery - 在循环中 `this` 内的元素之间插入元素