我尝试构造网址以进行网页抓取。代码从给定的开始日期循环到结束日期,以下是我的代码:
# Build one University-of-Wyoming sounding URL per day between startDate
# and endDate (inclusive) for station 48657, and print each URL.
startDate <- as.Date("01-11-17", format = "%d-%m-%y")
endDate   <- as.Date("31-01-18", format = "%d-%m-%y")

theDay <- startDate
while (theDay <= endDate) {
  # Zero-padded date components expected by the CGI query string.
  dy    <- as.character(theDay, format = "%d")
  month <- as.character(theDay, format = "%m")
  year  <- as.character(theDay, format = "%Y")

  wyoming <- "http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR="
  # BUG FIX: the end-hour parameter is "TO=", not "T0=" (letter O, not zero).
  # With the typo the server never saw the requested end hour.
  address <- paste0(wyoming, year, "&MONTH=", month,
                    "&FROM=", dy, "00&TO=", dy, "00&STNM=48657")
  print(address)

  theDay <- theDay + 1  # advance one calendar day
}
我不太了解 HTML,但我喜欢这段代码 https://stackoverflow.com/a/52539658/7356308 ,它将数据转换为数据框,之后处理起来更简单。它收集网页响应,并按实际的列名称提取数据。它本来工作正常……直到我把它合并进循环任务中,就报出如下错误:
Error in wx_dat[[1]] : subscript out of bounds
请就此提出建议...谢谢
library(httr)
library(rvest)

# Scrape one sounding per day for station 48657 and parse each page's
# <pre> block into a data frame, collected in `all_days`.
startDate <- as.Date("01-11-17", format = "%d-%m-%y")
endDate   <- as.Date("31-01-18", format = "%d-%m-%y")

all_days <- list()  # one parsed data frame per day that has observations
theDay <- startDate
while (theDay <= endDate) {
  dy    <- as.character(theDay, format = "%d")
  month <- as.character(theDay, format = "%m")
  year  <- as.character(theDay, format = "%Y")

  res <- httr::GET(
    url = "http://weather.uwyo.edu/cgi-bin/sounding",
    query = list(
      region = "seasia",
      TYPE = "TEXT:list",
      YEAR = year,
      MONTH = month,
      FROM = paste0(dy, "00"),
      TO = paste0(dy, "00"),  # BUG FIX: the end hour was missing entirely
      STNM = "48657"
    )
  )

  # Parse the response into an HTML document and grab every <pre> node.
  wx_dat <- httr::content(res, as = "parsed") %>% html_nodes("pre")

  # ROOT CAUSE of "Error in wx_dat[[1]] : subscript out of bounds":
  # on days with no observation the page contains no <pre> node at all,
  # so wx_dat has length 0 and wx_dat[[1]] fails. Skip those days.
  if (length(wx_dat) == 0) {
    message("No sounding data for ", theDay, " - skipped.")
  } else {
    html_text(wx_dat[[1]]) %>%          # turn the first <pre> node into text
      strsplit("\n") %>%                # split it into lines
      unlist() %>%                      # back to a character vector
      { col_names <<- .[3]; . } %>%     # stash the column-name line
      .[-(1:5)] %>%                     # strip off the header lines
      paste0(collapse = "\n") -> readings
    # BUG FIX: "\ +" is an invalid escape in R strings; " +" is the
    # intended "one or more spaces" split pattern.
    all_days[[as.character(theDay)]] <- readr::read_table(
      readings,
      col_names = tolower(unlist(strsplit(trimws(col_names), " +")))
    )
  }

  print(theDay)
  Sys.sleep(2)  # be kind to the server between requests
  theDay <- theDay + 1
}
最佳答案
我已将该函数封装到 non-CRAN package 中。您可以:
devtools::install_git("https://gitlab.com/hrbrmstr/unsound.git")
然后:
library(unsound)
library(magick)
library(tidyverse)
# Same date range as the question's script; "%y" parses two-digit years.
startDate <- as.Date("01-11-17", format="%d-%m-%y")
endDate <- as.Date("31-01-18",format="%d-%m-%y")
# make a sequence
days <- seq(startDate, endDate, "1 day")
# apply the sequence — note that I am not going to hit the server >80x for
# an example and *you* should add a Sys.sleep(5) before the call to
# get_sounding_data() to be kind to their servers.
# NOTE(review): get_sounding_data() comes from the non-CRAN "unsound"
# package; per the warning below, days without an observation emit a
# warning rather than an error — presumably yielding an empty/NULL entry
# in the result list. TODO confirm against the package source.
lapply(days[1:4], function(day) {
get_sounding_data(
region = "seasia",
date = day,
from_hr = "00",
to_hr = "00",
station_number = "48657"
)
}) -> soundings_48657
## Warning message:
## In get_sounding_data(region = "seasia", date = day, from_hr = "00", :
## Can't get 48657 WMKD Kuantan Observations at 00Z 01 Nov 2017.
# Combine the per-day results into a single tibble; days that failed to
# download appear to be dropped (176 rows from 3 successful days below).
rbind_soundings(soundings_48657)
## # A tibble: 176 x 14
## pres_hpa hght_m temp_c dwpt_c relh_pct mixr_g_kg drct_deg sknt_knot
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1006. 16. 24.0 23.4 96. 18.4 0. 0.
## 2 1000. 70. 23.6 22.4 93. 17.4 0. 0.
## 3 993. 132. 23.2 21.5 90. 16.6 NA NA
## 4 981. 238. 24.6 21.6 83. 16.9 NA NA
## 5 1005. 16. 24.2 23.6 96. 18.6 190. 1.
## 6 1000. 62. 24.2 23.1 94. 18.2 210. 3.
## 7 991. 141. 24.0 22.9 94. 18.1 212. 6.
## 8 983. 213. 23.8 22.7 94. 18.0 213. 8.
## 9 973. 302. 23.3 22.0 92. 17.4 215. 11.
## 10 970. 329. 23.2 21.8 92. 17.3 215. 11.
## # ... with 166 more rows, and 6 more variables: thta_k <dbl>,
## # thte_k <dbl>, thtv_k <dbl>, date <date>, from_hr <chr>, to_hr <chr>
我还添加了一个函数,用于检索服务器上预先生成的探空图:
# Fetch the pre-generated Skew-T chart (GIF) for yesterday's 00Z sounding
# at station 48657 in the "seasia" region. Arguments are gathered into a
# named list and forwarded with do.call(); the call is identical to
# invoking get_sounding_map() with the same named arguments directly.
sounding_map_args <- list(
  station_number = "48657",
  date = Sys.Date() - 1,
  map_type = "skewt",
  map_format = "gif",
  region = "seasia",
  from_hr = "00",
  to_hr = "00"
)
do.call(get_sounding_map, sounding_map_args)
关于r - 使用 httr 进行网页抓取会出现 xml_nodeset 错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/52543892/