python - 使用 Python requests session 登录 LinkedIn

标签 python web-scraping linkedin

我正在尝试使用 Python 的 requests 库登录 LinkedIn:

import sys
import requests
from BeautifulSoup import BeautifulSoup


# Credentials posted as the form fields expected by LinkedIn's login form.
payload={
    'session-key' : 'user@email.com',
    'session-password' : 'password'
}

URL='https://www.linkedin.com/uas/login-submit'
# requests.session() keeps cookies across requests, so the login cookie
# set by the POST below should be sent automatically on the next GET.
s=requests.session()
s.post(URL,data=payload)

# If the login succeeded, the homepage title should read "Welcome! | LinkedIn".
r=s.get('http://www.linkedin.com/nhome')
soup = BeautifulSoup(r.text)
print soup.find('title')

我似乎无法使用此方法登录。我甚至尝试过在有效负载(payload)中加入 csrf token 等字段,但 session 不是应该自动为我处理这些吗?

注意最后一行:我使用页面标题来检查自己是否已成功登录。(如果已经登录,我应该看到“欢迎! | LinkedIn”,而不是“世界最大的专业网络 | LinkedIn”。)

我错过了什么吗?

最佳答案

我修改了我用于大多数基于 Python 的抓取需求的网络抓取模板以满足您的需求。验证它适用于我自己的登录信息。

它的工作方式是模仿浏览器并维护一个存储用户 session 的 cookieJar。也让它与 BeautifulSoup 一起工作。

注意:这是 Python2 版本。我应要求在下面进一步添加了一个有效的 Python3 示例。

import cookielib
import os
import urllib
import urllib2
import re
import string
from BeautifulSoup import BeautifulSoup

# Demo credentials -- replace with a real LinkedIn login before running.
username = "user@email.com"
password = "password"

# File where the Mozilla-format cookie jar is persisted between runs.
cookie_filename = "parser.cookies.txt"

class LinkedInParser(object):
    """Log in to LinkedIn and scrape pages while persisting the session.

    Mimics a browser (custom User-Agent header) and stores the
    authenticated session cookies in a Mozilla-format cookie jar on
    disk, so later runs can reuse the login.  Python 2 version.
    """

    def __init__(self, login, password):
        """Build the cookie-aware opener, log in, and print the page title."""
        self.login = login
        self.password = password

        # Simulate a browser with cookies enabled; reload a previously
        # saved jar (if any) so a fresh login may be unnecessary.
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(self.cj)
        )
        # Present ourselves as a browser -- some sites serve different
        # content to the default urllib2 User-Agent.
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                           'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        # Print the feed title as a quick "am I logged in?" check.
        title = self.loadTitle()
        print title

        # Persist the cookies so the next run can skip the login step.
        self.cj.save()


    def loadPage(self, url, data=None):
        """
        Utility function to load HTML from URLs for us with hack to continue despite 404
        """
        # We'll print the url in case of infinite loop
        # print "Loading URL: %s" % url
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            return ''.join(response.readlines())
        except:
            # If URL doesn't load for ANY reason, try again...
            # Quick and dirty solution for 404 returns because of network problems
            # However, this could infinite loop if there's an actual problem
            return self.loadPage(url, data)

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        # Fetch the public home page first to obtain the CSRF token
        # embedded in the login form.
        html = self.loadPage("https://www.linkedin.com/")
        soup = BeautifulSoup(html)
        csrf = soup.find(id="loginCsrfParam-login")['value']

        login_data = urllib.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        })

        # POST the credentials; the session cookies land in self.cj.
        html = self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        """Return the <title> tag of the feed page (reflects login status)."""
        html = self.loadPage("https://www.linkedin.com/feed/")
        soup = BeautifulSoup(html)
        return soup.find("title")

# Running the script performs the login immediately and prints the title.
parser = LinkedInParser(username, password)

2014 年 6 月 19 日更新:添加了对来自主页的 CSRF token 的解析,以便在更新的登录过程中使用。

2015 年 7 月 23 日更新:在此处添加一个 Python 3 示例。基本上只需要替换库的导入位置并移除已弃用的方法。它的格式并不完美,但它是可以正常工作的。很抱歉这是匆忙完成的。归根结底,原理和步骤都是一样的。

# NOTE: a bare `import urllib` does NOT make the `urllib.request` /
# `urllib.parse` submodules available in Python 3; the code below uses
# both, so import them explicitly.
import http.cookiejar as cookielib
import os
import re
import string
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# Demo credentials -- replace with a real LinkedIn login before running.
username = "user@email.com"
password = "password"

# File where the Mozilla-format cookie jar is persisted between runs.
cookie_filename = "parser.cookies.txt"

class LinkedInParser(object):
    """Log in to LinkedIn and scrape pages while persisting the session.

    Mimics a browser (custom User-Agent header) and stores the
    authenticated session cookies in a Mozilla-format cookie jar on
    disk, so later runs can reuse the login.  Python 3 version.
    """

    def __init__(self, login, password):
        """Build the cookie-aware opener, log in, and print the feed title.

        Args:
            login: LinkedIn account e-mail address.
            password: LinkedIn account password.
        """
        self.login = login
        self.password = password

        # Simulate a browser with cookies enabled; reload a previously
        # saved jar (if any) so a fresh login may be unnecessary.
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        # Present ourselves as a browser -- some sites serve different
        # content to the default urllib User-Agent.
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                           'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Log in, print the feed title as a quick "am I logged in?"
        # check, then persist the cookies for the next run.
        self.loginPage()

        title = self.loadTitle()
        print(title)

        self.cj.save()


    def loadPage(self, url, data=None, retries=3):
        """Fetch *url* and return the response body as text.

        Retries on any error as a quick-and-dirty workaround for
        transient network failures, but gives up and re-raises after
        *retries* attempts -- the previous unbounded recursion could
        loop forever on a persistent failure.

        Args:
            url: Address to fetch.
            data: Optional URL-encoded POST body (bytes); GET when None.
            retries: Remaining retry attempts before giving up.
        """
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            # read() returns bytes; decode to str so BeautifulSoup sees
            # real HTML.  (Calling str() on each bytes line, as before,
            # embeds literal "b'...'" markers into the document.)
            return response.read().decode('utf-8', errors='replace')
        except Exception:
            if retries <= 0:
                raise
            return self.loadPage(url, data, retries - 1)

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup
        """
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        soup = self.loadSoup("https://www.linkedin.com/")
        # The login form embeds a CSRF token that must be echoed back;
        # fail with a clear message instead of an opaque TypeError when
        # LinkedIn changes the page and the element disappears.
        csrf_input = soup.find(id="loginCsrfParam-login")
        if csrf_input is None:
            raise ValueError("CSRF token not found on the LinkedIn login page")
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf_input['value'],
        }).encode('utf8')

        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        """Return the <title> tag of the feed page (reflects login status)."""
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")

# Instantiating the parser performs the login and prints the feed title.
parser = LinkedInParser(username, password)

关于python - 使用 python 请求 session 登录 LinkedIn,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/18907503/

相关文章:

python - MySQL 选择具有几乎相同值的多行

python - 导入错误 : No module named 'Cython' when installing Cython on mac for darkflow

python - 当我执行 GET 请求时(在 Python 中),我得到了翻译的文本。如何获取英文内容?

python - 如何获取 Mac 应用程序的 Bundle ID?

R - 使用 rvest 包进行抓取

Excel VBA - 从网页中提取数据

android - 在 Android Studio 中设置 LinkedIn SDK 时出错

iphone - 如何在iOS SDK中获取LinkedIn连接/ friend /联系人?

ruby-on-rails - 使用 linkedin api 发布公共(public)消息时出现 "Access to posting shares denied"错误

python - Alpine python 3.7.7 docker:pipenv无法安装psycopg2 2.8.4和pyzmq,而在使用python 3.7.3的主机上安装良好