python - BeautifulSoup 文本挖掘 - 变量字符串

标签 python html web-scraping beautifulsoup scrapy

我正在尝试从 Indeed.com 中提取文本,但由于某种原因,无法从如下所示的 jobmap 变量中提取文本。我已附上我的代码,供您查看和指导。我使用的原始链接是 http://www.indeed.co.uk/jobs?q=python&l= 。

我希望用 jobmap[0] 到 jobmap[9] 的数据创建一个数据框:

<script type="text/javascript">

function rclk(el,jobdata,oc,sal) { var ocstr = oc ? '&onclick=1' : ''; document.cookie='RCLK="jk='+jobdata.jk+'&tk=19i8hio29173i4q9&rd='+jobdata.rd+'&qd=7tdTJLF8oc4dPpT7T_zGvNMUkEhdsofXi_d_0hd2X6v0K0UAGbvReB0EpTyqsy6mfwDp0dterPbHZubI-Ho6fr5IbNHuDaBjQT6u6eGSKV6XZjJx0CQssKb7HhgrPx5f&ts=1428363501641&sal='+sal+ocstr+'"; path=/'; return true;}
function zrprclk(el,jobdata,oc) { var ocstr = oc ? '&onclick=1' : ''; document.cookie='RCLK="jk='+jobdata.jk+'&tk=19i8hio29173i4q9&from=reconzrp&rd='+jobdata.rd+'&qd=7tdTJLF8oc4dPpT7T_zGvNMUkEhdsofXi_d_0hd2X6v0K0UAGbvReB0EpTyqsy6mfwDp0dterPbHZubI-Ho6fr5IbNHuDaBjQT6u6eGSKV6XZjJx0CQssKb7HhgrPx5f&ts=1428363501641'+ocstr+'"; path=/'; return true;}
function prjbottomclk(el,jobdata,oc) { var ocstr = oc ? '&onclick=1' : ''; document.cookie='RCLK="jk='+jobdata.jk+'&tk=19i8hio29173i4q9&from=reconserp&rd='+jobdata.rd+'&qd=7tdTJLF8oc4dPpT7T_zGvNMUkEhdsofXi_d_0hd2X6v0K0UAGbvReB0EpTyqsy6mfwDp0dterPbHZubI-Ho6fr5IbNHuDaBjQT6u6eGSKV6XZjJx0CQssKb7HhgrPx5f&ts=1428363501641'+ocstr+'"; path=/'; return true;}

var jobmap = {};

jobmap[0]= {jk:'833b3b546fa19a15',efccid: 'ba27a1a49bded3ca',srcid:'bd5b1a0b89fdc77a',cmpid:'1c61cbd342c70437',num:'0',srcname:'ustwo studio Ltd',cmp:'ustwo studio Ltd',cmpesc:'ustwo studio Ltd',cmplnk:'/ustwo-studio-jobs',loc:'London',country:'GB',zip:'',city:'London',title:'Data Scientist',locid:'833c779eabe84c9f',rd:'2G0bcbLxcAqiHB9MMTYN9Q'};

jobmap[1]= {jk:'bf6df27f1d3b90fb',efccid: '98f3e203ab7d8e01',srcid:'b0a70c53f51e95a6',cmpid:'fe8b4fdb8a17a513',num:'1',srcname:'Reed Business Information',cmp:'Reed Business Information',cmpesc:'Reed Business Information',cmplnk:'/Reed-Business-Information-jobs',loc:'Heathrow',country:'GB',zip:'',city:'Heathrow',title:'Data Analytics Manager - Flightglobal - Heathrow, Middlesex',locid:'4296d6706ebc67b5',rd:'4ZrZ-vtiYwdobVTLuwlSBHEwqdD0vnOb9P51Phyha6c'};

jobmap[2]= {jk:'146969d233b25b49',efccid: '2a58d847c3011c18',srcid:'b4a49235193125a8',cmpid:'1544766d4c2915b0',num:'2',srcname:'EY',cmp:'EY',cmpesc:'EY',cmplnk:'/EY-jobs',loc:'London',country:'GB',zip:'',city:'London',title:'Analytics Manager - People Data',locid:'833c779eabe84c9f',rd:'WPdCTYq1ZBHM1poxVAfv11_MKnaSAFGAsD6kfERFt3g'};
...
...
...
...
...
...
jobmap[9]=

</script>

我的代码是:

from bs4 import BeautifulSoup
import urllib2
import csv
import os
import re
import requests

page1 = urllib2.urlopen('http://www.indeed.co.uk/jobs?q=%22data+science%22')
soup = BeautifulSoup(page1)


for title in soup.findAll('h2',{'class' : 'jobtitle'}):
    print title.text


for company in soup.findAll('span',{'class' : 'company'}):
    print company.text

最佳答案

这里的思路是:先找到包含所需对象定义的 script 元素,用正则表达式找出所有 jobmap 对象定义,然后用 demjson 模块把它们转换为字典。最终你会得到一个字典列表:

from pprint import pprint
import re
import urllib2

from bs4 import BeautifulSoup
import demjson


# Fetch the search-results page. The job metadata is embedded in an inline
# <script> as JavaScript object literals assigned to jobmap[0], jobmap[1], ...
page1 = urllib2.urlopen('http://www.indeed.co.uk/jobs?q=%22data+science%22')
try:
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(page1, 'html.parser')
finally:
    # Release the HTTP connection even if parsing fails.
    page1.close()

# Capture the whole {...} literal and require the closing brace directly
# before the ';' so that a semicolon inside a quoted value (e.g. a job title)
# does not cut the match short. Whitespace around '=' is optional.
pattern = re.compile(r"jobmap\[\d+\]\s*=\s*(\{.*?\});")

# Locate the <script> whose content declares the jobmap variable.
script_tag = soup.find('script', text=lambda text: text and "jobmap = {}" in text)
if script_tag is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("no <script> element containing 'jobmap = {}' was found")
# .string is the single text node of the element.
script = script_tag.string

# The literals use unquoted keys and single-quoted strings, which strict JSON
# parsers reject; demjson is used to turn each one into a dict.
data = [demjson.decode(item) for item in pattern.findall(script)]
pprint(data)

打印字典列表:

[{u'city': u'London',
  u'cmp': u'ustwo studio Ltd',
  u'cmpesc': u'ustwo studio Ltd',
  u'cmpid': u'1c61cbd342c70437',
  u'cmplnk': u'/ustwo-studio-jobs',
  u'country': u'GB',
  u'efccid': u'ba27a1a49bded3ca',
  u'jk': u'833b3b546fa19a15',
  u'loc': u'London',
  u'locid': u'833c779eabe84c9f',
  u'num': u'0',
  u'rd': u'2G0bcbLxcAqiHB9MMTYN9Q',
  u'srcid': u'bd5b1a0b89fdc77a',
  u'srcname': u'ustwo studio Ltd',
  u'title': u'Data Scientist',
  u'zip': ''},
  ...
 {u'city': u'Belfast',
  u'cmp': u'Allstate Northern Ireland',
  u'cmpesc': u'Allstate Northern Ireland',
  u'cmpid': u'bd6c20d6c99988f6',
  u'cmplnk': u'/Allstate-Northern-Ireland-jobs',
  u'country': u'GB',
  u'efccid': u'521645e5cd22988a',
  u'jk': u'9b517e0b09e09ca0',
  u'loc': u'Belfast',
  u'locid': u'e6523dbdeffe6c9b',
  u'num': u'9',
  u'rd': u'hW5WLDedIUk_fnMJS2cPmngDVkFzbh8-xI2u2vwcbH0',
  u'srcid': u'eb73601b9a76cd58',
  u'srcname': u'Allstate Northern Ireland',
  u'title': u'Big Data Analytics',
  u'zip': ''}]

关于python - BeautifulSoup 文本挖掘 - 变量字符串,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/29482633/

相关文章:

python - 在pyQt中的qlistWidget中读取.csv文件

python - Django 在已经登录时显示 Auth 登录页面

HTML CSS 文本不会居中

javascript - 使用 CasperJS 接受 cookie 策略

python - 有没有办法使用 BeautifulSoup 将数据从列表正确转换为 CSV 文件?

python - 如何引用对象自己的容器

python - 当原始类的实例由另一个类的方法生成时,如何处理子类?

html - Bootstrap 下拉菜单出现在页面底部

html - Outlook CSS 渲染与浏览器不同

python - 使用Python从网页中提取图像链接