我正在使用以下脚本登录 LinkedIn 并完成身份验证,然后使用 Beautiful Soup 抓取 HTML。
登录验证没有问题(我能看到我的帐户信息),但是当我尝试加载页面时,会收到“fs.config({"failureRedirect})”错误。
import cookielib
import os
import urllib
import urllib2
import re
import string
import sys
from bs4 import BeautifulSoup
# Files the script reads and writes.
cookie_filename = 'parser.cookies.txt'
ofile = open('Text_Dump.txt', 'wb')  # dump target, opened for binary writing

# Account credentials (placeholder values).
username, password = 'MY USERNAME', 'PASSWORD'
class LinkedInParser(object):
    """Log in to LinkedIn through a cookie-enabled urllib2 opener.

    Written for Python 2 (uses ``cookielib`` / ``urllib2``). Constructing an
    instance logs in immediately and writes the login response to the
    module-level ``ofile``.
    """

    def __init__(self, login, password):
        """Build the opener, log in, and dump the login response.

        Args:
            login: Account name, sent as ``session_key``.
            password: Account password, sent as ``session_password``.
        """
        self.login = login
        self.password = password

        # Simulate a browser with cookies enabled; reuse a cookie file left
        # by an earlier run when one exists.
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        title = self.loginPage()
        sys.stderr.write("Login"+ str(self.login) + "\n")
        ofile.write(title)

    def loadPage(self, url, data=None, max_attempts=3):
        """Fetch ``url`` and return the response body as one string.

        Args:
            url: Address to open.
            data: Optional url-encoded body; when given it is passed to the
                opener together with ``url``.
            max_attempts: How many times to try before giving up. Values
                below 1 are treated as 1.

        Returns:
            The concatenated lines of the response.

        Raises:
            Exception: Whatever the final attempt raised, once
                ``max_attempts`` is exhausted.
        """
        last_attempt = max(1, max_attempts)
        for attempt in range(1, last_attempt + 1):
            try:
                if data is not None:
                    response = self.opener.open(url, data)
                else:
                    response = self.opener.open(url)
                return ''.join(response.readlines())
            except Exception:
                # Retry on failure, but only a bounded number of times:
                # retrying forever turns a permanent error into an endless
                # loop. ``Exception`` (not a bare except) keeps Ctrl-C and
                # SystemExit working.
                if attempt == last_attempt:
                    raise

    def loginPage(self):
        """Submit the login form; this should populate the cookie jar.

        Returns:
            The HTML returned by the login-submit request.

        Raises:
            ValueError: If the CSRF token element is not present in the
                LinkedIn home page HTML.
        """
        html = self.loadPage("https://www.linkedin.com/")
        soup = BeautifulSoup(html)
        csrf_field = soup.find(id="csrfToken-postModuleForm")
        if csrf_field is None:
            # find() yields None when nothing matches; fail with a clear
            # message instead of an opaque 'NoneType' subscript error.
            raise ValueError(
                "CSRF token field 'csrfToken-postModuleForm' not found")
        csrf = csrf_field['value']
        login_data = urllib.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        })
        # Return the response so the caller has something to write out; a
        # bare ``return`` handed None to ofile.write().
        return self.loadPage(
            "https://www.linkedin.com/uas/login-submit", login_data)

    def loadTitle(self):
        """Return the LinkedIn home page text, UTF-8 encoded and stripped."""
        html = self.loadPage("https://www.linkedin.com/")
        soup = BeautifulSoup(html)
        return soup.get_text().encode('utf-8').strip()
# Close the dump file even when login or page loading raises, so the handle
# is not leaked and buffered output is flushed.
try:
    parser = LinkedInParser(username, password)
finally:
    ofile.close()
登录脚本来自: Logging in to LinkedIn with python requests sessions
有什么想法吗?
最佳答案
你的语法有错误。
首先 - csrf 是一个 input 字段而不是 div 标签;使用浏览器的“检查元素”功能即可看到。
第二 - 要查找具有指定属性和值的标签,您需要使用 .find('type_of_tag', {'tag_attribute': 'value'})
第三 - 要访问指定标签内特定属性的值,您需要使用方括号语法或 .get()
这是您必须替换的代码
# Replacement lines for the token lookup inside loginPage().
# Load the LinkedIn home page HTML and parse it.
html = self.loadPage("https://www.linkedin.com/")
soup = BeautifulSoup(html)
# Select the token as an <input> element by its name attribute.
# NOTE(review): find() returns None when nothing matches, which would make
# the subscript below raise -- confirm the field name against the live page.
csrf = soup.find('input', {"name" : "csrfToken"})
# Subscripting a tag reads the value of that attribute.
csrf_token = csrf['value']
print csrf_token
关于python - BeautifulSoup 登录 - 如何获取具有特定属性和值的 crsf 字段,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28292010/