r - 在 R 中使用 tidyjson 从嵌套 json 文档中提取数据

标签 r jsonlite

我正在查询一个研究出版物数据库。每篇文章的作者数量不同(从 1 位到 20 多位)。我的目标是创建一份合著者边列表(edge list),以便用 igraph 做社交网络分析。下面是一段 JSON 示例:

{
"format": "linked-data-api",
"version": "0.2",
"result": {
"_about": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1",
"definition": "http://network.csiro.au:9500/standalone/meta/publications.json",
"extendedMetadataVersion": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1&_metadata=all",
"first": "http://network.csiro.au:9500/standalone/publications.json?_page=0",
"isPartOf": {
"_about": "http://network.csiro.au:9500/standalone/publications.json",
"definition": "http://network.csiro.au:9500/standalone/meta/publications.json",
"hasPart": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1",
"type": [
"http://purl.org/linked-data/api/vocab#ListEndpoint"
]
},
"items": [
{
"_about": "http://network.csiro.au/data/pub_EP1312922",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/aimee.slangen",
"hasName": {
"_about": "http://network.csiro.au/data/aimee.slangen_name",
"firstName": "Aimee",
"lastName": "Slangen",
"title": "Ms"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP1312922_author_0",
"author": {
"_about": "http://network.csiro.au/data/aimee.slangen",
"hasName": {
"_about": "http://network.csiro.au/data/aimee.slangen_name",
"firstName": "Aimee",
"lastName": "Slangen",
"title": "Ms"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_040104",
"name": "Climate Change Processes"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"journalTitle": "Journal of Geophysical Research-Oceans",
"keyword": " ",
"outcome": "Approved",
"pages": "156-164",
"project": "http://network.csiro.au/data/project_PD00003609",
"publicationVolume": "119",
"publishedDate": "9-Jan-2014",
"publisher": "American Geophysical Union",
"title": "Regional Differences of Relative Sea Level Changes in the Northwest Atlantic: Historical Trends and Future Projections",
"wbscode": "R-03426-01-003",
"yearOfPublication": "2014"
},
{
"_about": "http://network.csiro.au/data/pub_EP112347",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/roland.pitcher",
"hasName": {
"_about": "http://network.csiro.au/data/roland.pitcher_name",
"firstName": "Roland",
"lastName": "Pitcher",
"title": "Dr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP112347_author_0",
"author": {
"_about": "http://network.csiro.au/data/roland.pitcher",
"hasName": {
"_about": "http://network.csiro.au/data/roland.pitcher_name",
"firstName": "Roland",
"lastName": "Pitcher",
"title": "Dr"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_050209",
"name": "Natural Resource Management"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"keyword": " ",
"outcome": "Approved",
"project": "http://network.csiro.au/data/project_PD00000752",
"publisher": "Queensland Department of Environment and Resource Management",
"title": "Understanding and Managing the Effects of Trawling on the Seabed in the Great Barrier Reef",
"wbscode": "R-00654-03-003",
"yearOfPublication": " "
},
{
"_about": "http://network.csiro.au/data/pub_EP148991",
"access": "CSIRO Only",
"author": {
"_about": "http://network.csiro.au/data/rob.bramley",
"hasName": {
"_about": "http://network.csiro.au/data/rob.bramley_name",
"firstName": "Rob",
"lastName": "Bramley",
"title": "Dr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP148991_author_0",
"author": {
"_about": "http://network.csiro.au/data/rob.bramley",
"hasName": {
"_about": "http://network.csiro.au/data/rob.bramley_name",
"firstName": "Rob",
"lastName": "Bramley",
"title": "Dr"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_070107",
"name": "Farming Systems Research"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"keyword": " ",
"outcome": "Approved",
"pages": "26 + appendices",
"project": "http://network.csiro.au/data/project_PD00002886",
"publishedDate": "17-Sep-2014",
"publisher": "SRA",
"title": "A collaborative approach to Precision Agriculture RDE for the Australian Sugar Industry",
"wbscode": "R-02709-01",
"yearOfPublication": "2014"
},
{
"_about": "http://network.csiro.au/data/pub_EP151976",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/paul.krummel",
"hasName": {
"_about": "http://network.csiro.au/data/paul.krummel_name",
"firstName": "Paul",
"lastName": "Krummel",
"title": "Mr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP151976_author_0",
"author": {
"_about": "http://network.csiro.au/data/paul.krummel",
"hasName": {
"_about": "http://network.csiro.au/data/paul.krummel_name",
"firstName": "Paul",
"lastName": "Krummel",
"title": "Mr"
}
},
"sequenceNumber": 0
},
"classification": [
{
"_about": "http://network.csiro.au/data/classification_code_040104",
"name": "Climate Change Processes"
},
{
"_about": "http://network.csiro.au/data/classification_code_040199",
"name": "Atmospheric Sciences not elsewhere classified"
}
],
"classificationLevel": "http://network.csiro.au/data/unclassified",
"journalTitle": "Atmospheric Chemistry and Physics",
"keyword": [
"CH4",
"OH",
"hydroxyl radical",
"methane"
],
"outcome": "Approved",
"pages": "7943\u20137956",
"project": "http://network.csiro.au/data/project_PD00009165",
"publicationVolume": "16",
"publishedDate": "30-Jun-2016",
"publisher": "Copernicus GmbH",
"title": "Role of OH variability in the stalling of the global atmospheric CH4 growth rate from 1999 to 2006",
"wbscode": "R-07848; R-06420; R-07768",
"yearOfPublication": "2016"
},
{
"_about": "http://network.csiro.au/data/pub_EP152677",
"access": "CSIRO Only",
"author": [
{
"_about": "http://network.csiro.au/data/andrew.george",
"hasName": {
"_about": "http://network.csiro.au/data/andrew.george_name",
"firstName": "Andrew",
"lastName": "George",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/sigrid.lehnert",
"hasName": {
"_about": "http://network.csiro.au/data/sigrid.lehnert_name",
"firstName": "Sigrid",
"lastName": "Lehnert",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/toni.reverter-gomez",
"hasName": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez_name",
"firstName": "Toni",
"lastName": "Reverter-Gomez",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/yutao.li",
"hasName": {
"_about": "http://network.csiro.au/data/yutao.li_name",
"firstName": "Yutao",
"lastName": "Li",
"title": "Dr"
}
}
],
"authorSeq": [
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_0",
"author": {
"_about": "http://network.csiro.au/data/yutao.li",
"hasName": {
"_about": "http://network.csiro.au/data/yutao.li_name",
"firstName": "Yutao",
"lastName": "Li",
"title": "Dr"
}
},
"sequenceNumber": 0
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_1",
"author": {
"_about": "http://network.csiro.au/data/andrew.george",
"hasName": {
"_about": "http://network.csiro.au/data/andrew.george_name",
"firstName": "Andrew",
"lastName": "George",
"title": "Dr"
}
},
"sequenceNumber": 1
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_2",
"author": {
"_about": "http://network.csiro.au/data/sigrid.lehnert",
"hasName": {
"_about": "http://network.csiro.au/data/sigrid.lehnert_name",
"firstName": "Sigrid",
"lastName": "Lehnert",
"title": "Dr"
}
},
"sequenceNumber": 2
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_3",
"author": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez",
"hasName": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez_name",
"firstName": "Toni",
"lastName": "Reverter-Gomez",
"title": "Dr"
}
},
"sequenceNumber": 3
}
],
"classification": {
"_about": "http://network.csiro.au/data/classification_code_070201",
"name": "Animal Breeding"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"conferenceDate": "28th-30th September 2015",
"conferenceLocation": "Lorne, Victoria",
"conferenceName": "21st AAABG",
"keyword": " ",
"outcome": "Approved",
"pages": "433-436",
"project": "http://network.csiro.au/data/project_PD00005603",
"publicationVolume": "21",
"publishedDate": "25-Sep-2015",
"publisher": "Association for the Advancement of Animal Breeding and Genetics",
"title": "Using Random Forests to Identify SNP Associated With Leg Defect in Broiler Chicken: Impact of Correcting For Population Structures",
"wbscode": "R-05156",
"yearOfPublication": "2015"
}
],
"itemsPerPage": 5,
"next": "http://network.csiro.au:9500/standalone/publications.json?_page=2",
"page": 1,
"prev": "http://network.csiro.au:9500/standalone/publications.json?_page=0",
"startIndex": 6,
"totalResults": 47023,
"type": [
"http://purl.org/linked-data/api/vocab#Page"
]
}
}

我读取的数据如下:

library(jsonlite)
library(tidyjson)
# fromJSON() parses the response into R objects (the error quoted further down
# reports class "list"), so `pubs` is no longer a JSON string.
pubs <- fromJSON("http://network.csiro.au:9500/standalone/publications.json?_page=1&_pageSize=5")

当尝试使用 tidyjson 提取有意义的数据时,我收到此错误:

# Reproduces the reported failure: `pubs` is an object of class "list", and
# as.tbl_json has no applicable method for that class.
# Note that in the sample document "items" is nested under "result".
pubs %>%
  as.tbl_json %>%
  enter_object("items")

Error in UseMethod("as.tbl_json") : 
  no applicable method for 'as.tbl_json' applied to an object of class "list"

我不是 R 或 JSON 方面的专家,因此希望得到一些指导。以上面的示例为基础,我想为每篇出版物创建一份合著者的边列表(edge list),如下所示:

_about                                    yearOfPublication from            to
http://network.url.com/data/pub_EP16079   2011              Colin Jackson   Holly Trueman
http://network.url.com/data/pub_EP16079   2011              Colin Jackson   Tara Sutherland
http://network.url.com/data/pub_EP16079   2011              Colin Jackson   Trevor Rapson
http://network.url.com/data/pub_EP16079   2011              Holly Trueman   Tara Sutherland
http://network.url.com/data/pub_EP16079   2011              Holly Trueman   Trevor Rapson
http://network.url.com/data/pub_EP16079   2011              Tara Sutherland Trevor Rapson

希望有人能帮助我!提前致谢。

最佳答案

这个例子有点棘手。请参阅 this issue,其中讨论了如何改进 tidyjson 对“有时是对象、有时是数组”的字段的处理方式。

这虽然不是最简洁的解决方案,但我认为确实能完成任务——您可以把其中几组步骤封装成函数,以便代码复用。

基本思路是逐层解析对象,直到作者这一层,然后对单个对象和对象数组分别使用不同的处理流程。数组需要用 tidyr::expand 补全所有作者两两之间的组合(因为这些组合在数据中并未直接给出)。

## read the saved sample document and join its lines (separated by a single
## space) into one JSON string
json <- paste0(readLines("ex.json"), collapse = " ")

library(dplyr)
library(tidyjson)
library(tidyr)

## Descend to result -> items and emit one row per publication (its position
## in the array lands in `array.index`). Keep the publication id and year,
## then step into each publication's "author" value and record its JSON type:
## a single author is stored as an object, several authors as an array.
prep <- json %>%
  enter_object("result") %>%
  enter_object("items") %>%
  gather_array() %>%
  spread_values(
    about = jstring("_about"),
    yearOfPublication = jstring("yearOfPublication")
  ) %>%
  enter_object("author") %>%
  json_types()

## Single-author publications: "author" is one JSON object.
## Extract the first and last name, combine them into a full name, and pair
## the author with themselves (from == to) so these rows have the same
## from/to columns as the multi-author rows.
## as_tibble() replaces the deprecated tbl_df(); it turns the result into a
## plain tibble.
authorobj <- prep %>%
  filter(as.character(type) == "object") %>%
  spread_values(
    authorFirst = jstring("hasName", "firstName"),
    authorLast = jstring("hasName", "lastName")
  ) %>%
  mutate(from = paste(authorFirst, authorLast), to = from) %>%
  select(-authorFirst, -authorLast) %>%
  as_tibble()


## Multi-author publications: "author" is a JSON array.
## Emit one row per array element (its position is stored in `authorid`),
## extract the name parts and combine them into the 'from' author.
authorarr <- prep %>%
  filter(as.character(type) == "array") %>%
  gather_array("authorid") %>%
  spread_values(
    authorFirst = jstring("hasName", "firstName"),
    authorLast = jstring("hasName", "lastName")
  ) %>%
  mutate(from = paste(authorFirst, authorLast, sep = " ")) %>%
  select(-c(authorFirst, authorLast))


## Use tidyr::expand to complete the from/to combinations per publication.
##
## The combinations must be built from the authors of the *current* group
## only. Referring to the column as `from` inside the grouped expand() does
## that; the global vector `authorarr$from` would instead cross every
## publication with the authors of all multi-author publications.
##
## The result keeps self-pairs (from == to) and both directions of each pair.
## as_tibble() replaces the deprecated tbl_df().
authorarr <- authorarr %>%
  as_tibble()

authorarr <- authorarr %>%
  left_join(
    authorarr %>%
      group_by(array.index) %>%
      expand(from, to = from) %>%
      ungroup(),
    by = c("array.index", "from")
  )

## stack the single-author and multi-author rows, keeping only the
## publication index and the from/to author names for display
select(dplyr::bind_rows(authorobj, authorarr), array.index, from, to)
#> # A tibble: 20 x 3
#>    array.index                from                  to
#>          <int>               <chr>               <chr>
#>  1           1       Aimee Slangen       Aimee Slangen
#>  2           2      Roland Pitcher      Roland Pitcher
#>  3           3         Rob Bramley         Rob Bramley
#>  4           4        Paul Krummel        Paul Krummel
#>  5           5       Andrew George       Andrew George
#>  6           5       Andrew George      Sigrid Lehnert
#>  7           5       Andrew George Toni Reverter-Gomez
#>  8           5       Andrew George            Yutao Li
#>  9           5      Sigrid Lehnert       Andrew George
#> 10           5      Sigrid Lehnert      Sigrid Lehnert
#> 11           5      Sigrid Lehnert Toni Reverter-Gomez
#> 12           5      Sigrid Lehnert            Yutao Li
#> 13           5 Toni Reverter-Gomez       Andrew George
#> 14           5 Toni Reverter-Gomez      Sigrid Lehnert
#> 15           5 Toni Reverter-Gomez Toni Reverter-Gomez
#> 16           5 Toni Reverter-Gomez            Yutao Li
#> 17           5            Yutao Li       Andrew George
#> 18           5            Yutao Li      Sigrid Lehnert
#> 19           5            Yutao Li Toni Reverter-Gomez
#> 20           5            Yutao Li            Yutao Li

关于r - 在 R 中使用 tidyjson 从嵌套 json 文档中提取数据,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/40878226/

相关文章:

R:通过 cyphr 加密字符串,发送到 JSON,然后从 JS 转换回字符串会导致问题

JSON到R中的数据框

r - 来自软件包 doParallel 的选项 "cores"在 Windows 上没用?

r - 如何在水平图中将 x 轴标签旋转 90 度

r - “Error in open.connection(x, "rb ") : Timeout was reached”

r - 网页在浏览器中工作,但不是来自 R : SSL certificate problem: certificate has expired

json - 如何在名称/值对中将 R 数据帧转换为 Json?

R中的MySQL使用GROUP BY过滤最早的日期

r - 选择特定数字后的 n 行

json - 如何使用 R 的 plumber 包发送 json 响应