python-2.7 - 使用 <script> 和 var 从 BeautifulSoup 中提取数据

标签 python-2.7 web-scraping beautifulsoup

以下是我要从中提取“name”和“event_place”的页面的 html 代码的一部分。但是,我以前从未见过以这种复杂的方式塞入数据。在 <script> 标签中,有“var person”,在其中,名称出现在“personBestName”下,即“John Stuart”。

“event_place”的情况也类似,同样位于“var person”之下……等等。事件地点实体应该是“B, Hamilton (city/cité), Ontario, Canada”

<script>

  var person = {"id":"p_14062397399","links":{"record":{"href":"https://familysearch.org/platform/records/records/9MFX-7VLY"},"persona":{"href":"https://familysearch.org/platform/records/personas/KH21-F11"}},"extracted":true,"identifiers":{"http://gedcomx.org/Persistent":["https://familysearch.org/ark:/61903/1:1:KH21-F11"],"$":["https://familysearch.org/platform/externalId/easy/1001080442645"]},"principal":true,"gender":{"type":"http://gedcomx.org/Male","fields":[{"type":"http://gedcomx.org/Gender","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_SEX_CODE","text":"Male","resource":"http://gedcomx.org/Male"}]}]},"names":[{"type":"http://gedcomx.org/BirthName","nameForms":[{"fullText":"John Stuart","parts":[{"type":"http://gedcomx.org/Given","value":"John","fields":[{"type":"http://gedcomx.org/Given","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME_GN","text":"John"}]}]},{"type":"http://gedcomx.org/Surname","value":"Stuart","fields":[{"type":"http://gedcomx.org/Surname","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME_SURN","text":"Stuart"}]}]}],"fields":[{"type":"http://gedcomx.org/Name","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME","text":"John Stuart"}]}]}]}],"facts":[{"type":"http://gedcomx.org/MaritalStatus","value":"Single","fields":[{"type":"http://gedcomx.org/MaritalStatus","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_MARITAL_STATUS","text":"Single"}]}]},{"type":"http://gedcomx.org/Religion","value":"Presbyterian","fields":[{"type":"http://gedcomx.org/Religion","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_RELIGION","text":"Presbyterian"}]}]},{"type":"http://gedcomx.org/Nationality","value":"Canadian","fields":[{"type":"http://gedcomx.org/Nationality","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NATIONALITY","text":"Canadian"}]}]},{"type":"http://gedcomx.org/Census","date":{"original":"31 Mar 
1901","fields":[{"type":"http://gedcomx.org/Date","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_DATE","text":"31 Mar 1901"}]},{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_YEAR","text":"1901"}]}]},"place":{"original":"B, Hamilton (city/cité), Ontario, Canada","fields":[{"type":"http://gedcomx.org/Place","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_PLACE","text":"B, Hamilton (city/cité), Ontario, Canada"}]}]},"primary":true},{"type":"http://gedcomx.org/Birth","date":{"original":"1831","fields":[{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_BIRTH_YEAR_ESTIMATED","text":"1831"}]}]},"place":{"original":"Scotland","fields":[{"type":"http://gedcomx.org/Place","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_BIRTH_PLACE","text":"Scotland"}]}]}},{"type":"http://gedcomx.org/Immigration","date":{"original":"1848","fields":[{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_IMMIGRATION_YEAR","text":"1848"}]}]}}],"fields":[{"type":"http://gedcomx.org/Age","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_AGE_ORIG","text":"70"}]},{"type":"http://familysearch.org/types/fields/UniqueIdentifier","values":[{"type":"http://gedcomx.org/Original","labelId":"UNIQUE_IDENTIFIER","text":"1001080442645"}]},{"type":"http://familysearch.org/types/fields/HouseholdId","values":[{"type":"http://gedcomx.org/Original","labelId":"HOUSEHOLD_ID","text":"66"}]},{"type":"http://gedcomx.org/RelationshipToHead","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_RELATIONSHIP_TO_HEAD","text":"Head"}]},{"type":"http://familysearch.org/types/fields/RelationshipToHeadCode","values":[{"type":"http://gedcomx.org/Original","labelId":"RELATIONSHIP_CODE","text":"SELF"}]},{"type":"http://familysearch.org/types/fields/CollectionId","values":[{"type":"http://gedcomx.org/Origina
l","labelId":"COLLECTION_ID","text":"1584557"}]},{"type":"http://familysearch.org/types/fields/EventDistrict","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_DISTRICT","text":"Hamilton (city/cité)"}]},{"type":"http://familysearch.org/types/fields/EventProvince","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_PROVINCE","text":"Ontario"}]},{"type":"http://familysearch.org/types/fields/EventSubDistrict","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_SUB_DISTRICT","text":"B"}]},{"type":"http://familysearch.org/types/fields/EventType","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_TYPE","text":"Census"}]},{"type":"http://familysearch.org/types/fields/Id","values":[{"type":"http://gedcomx.org/Original","labelId":"ID","text":"z002-z000067618"}]},{"type":"http://familysearch.org/types/fields/Page","values":[{"type":"http://gedcomx.org/Original","labelId":"PAGE","text":"8"}]},{"type":"http://familysearch.org/types/fields/Pid","values":[{"type":"http://gedcomx.org/Original","labelId":"PID","text":"11335440"}]},{"type":"http://familysearch.org/types/fields/PpqId","values":[{"type":"http://gedcomx.org/Original","labelId":"PPQ_ID","text":"08-0278"}]},{"type":"http://familysearch.org/types/fields/PrAgeInYears","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_AGE_IN_YEARS","text":"70"}]},{"type":"http://familysearch.org/types/fields/PrRacialOrTribalOrigin","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_RACIAL_OR_TRIBAL_ORIGIN","text":"Scotch"}]},{"type":"http://familysearch.org/types/fields/RollNumber","values":[{"type":"http://gedcomx.org/Original","labelId":"ROLL_NUMBER","text":"CC1901_47"}]},{"type":"http://familysearch.org/types/fields/SortKey","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"SORT_KEY","text":"z002-z000067618_0000066_11335440_1001080442645"}]}],"url":"https://familysearch.org/ark:/61903/1:1:KH21-F11","personBestName":"John 
Stuart","localizedGender":"Male","title":"John Stuart, \"Canada Census, 1901\"","personRecordTitle":"John Stuart","metadata":{"bibliographicCitation":"\"Canada Census, 1901,\" , <i>FamilySearch</i> (https://familysearch.org/ark:/61903/1:1:KH21-F11 : accessed 14 August 2015), John Stuart, B, Hamilton (city/cité), Ontario, Canada; citing p. 8, Library and Archives of Canada, Ottawa."},"imageMeta":{"thirdPartyHostName":"","isExternalImage":false,"thirdPartyURL":"","imageURL":"","wikiCollectionURL":"/learn/wiki/en/api.php?action=query&list=search&srwhat=text&format=json&srsearch=CID1584557"}};

我能够从 html 的另一部分(未显示)中,通过标签和指定的 class 提取出名称实体。

# coding=utf-8
import urllib2
import re
import csv
from bs4 import BeautifulSoup
import time
from unicodedata import normalize
Url = "https://familysearch.org/pal:/MM9.1.1/KHR6-D6D"
Page = urllib2.urlopen(Url)
Soup = BeautifulSoup(Page)
Page.close()
x = Soup.find("h3", { "class" : "print-only print-title" })
sx = x.string.encode('utf-8')
k = sx.split(', "Can')
kk = k[0].split(' in household')
name = kk[0]
print name

编辑:

# Get other fields
# Collect every <script> element from the already-parsed page (Soup).
rawJ = Soup.find_all('script')
# Serialise the script at position 10 to a string.
# NOTE(review): the index is hard-coded; presumably this is the script that
# defines "var person" on this particular page -- confirm.
J = str(rawJ[10])
# J1[1] is everything after the 'var person = ' assignment prefix.
J1 = J.split('var person = ')
# J2[0] is the part of that text that precedes 'var record ='.
J2 = J1[1].rsplit('var record =')
# Split once at the last ';' so J3[0] is the text before the final semicolon.
J3 = J2[0].rsplit(';', 1)

JsonText = J3[0]
#print JsonText

# NOTE(review): `json` is not imported in the visible snippets before this
# point -- confirm an `import json` exists.
s = json.loads(JsonText)
print s["personBestName"]

# UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 5: ordinal not in range(128)

最佳答案

那个长字符串是 JSON,它大致映射到 python 字典。您有键值对,例如键 "id" 和对应的值 "p_14062397399"。

所以我美化了这段 JSON(原回答在此处附有链接),这样你可以很容易地看到键值对和嵌套结构。要提取姓名和事件地点,您可以执行以下操作:

from bs4 import BeautifulSoup as bs
from urllib import urlopen
import json

Soup = bs(urlopen('https://familysearch.org/pal:/MM9.1.1/KHR6-D6D').read())

rawJ = Soup.find_all('script')
J = str(rawJ[10])
J1 = J.split('var person = ')
J2 = J1[1].rsplit('var record =')
J3 = J2[0].rsplit(';', 1)

JsonText = J3[0].decode('utf-8')


s = json.loads(JsonText)
print s["personBestName"]
for i in s["facts"]:
    if i["type"] == "http://gedcomx.org/Census":
        print i["place"]["fields"][0]["values"][0]["text"]

关于 python-2.7 - 使用 <script> 和 var 从 BeautifulSoup 中提取数据,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/32021142/

相关文章:

python - 从网页上的 anchor 标记访问详细信息

python - 尝试解析表中的表

ruby - 使用 Ruby 抓取图像数据库

python - Beautiful Soup 在每个字符之间添加空格

django - 如何使用 BeautifulSoup 搜索出现在另一个元素之前的元素?

python - Windows,Python27导入错误: cannot import name IncompleteRead

python-2.7 - Python urllib2 不尊重超时

python-2.7 - 如何在 Windows 上为 Python 2 安装 PyQt5?

python - 如何使用 Beautiful Soup 按属性值选择标签

python-2.7 - 如何计算图像的直方图?