node.js - 无法使用 Node 请求模块在 Linux 上读取 LinkedIn 内容

标签 node.js ubuntu request node-request

我正在使用 Node 请求模块来读取网站内容。当我在 Linux 上使用 Node 时,我无法获取 linkedin.com 的完整内容,但它在 Windows 和 Mac OS X 上运行良好。

我编写了以下代码:

var request = require('request');

// Fetch the LinkedIn article page and print its HTML body.
request('https://www.linkedin.com/pulse/social-media-why-its-essential-tool-oliver-bussmann', function (error, response, body) {
    if (error) {
        // The request failed before an HTTP response was received, so
        // `response` may be undefined here and must not be dereferenced.
        console.error(error);
        return;
    }

    if (response.statusCode === 200) {
        console.log(body);
    } else {
        // Non-200 answer: print the status code together with whatever body
        // came back, so the rejection can be inspected.
        console.log(response.statusCode, body);
    }
});

我收到 999 状态代码和以下 HTML 内容:

\n\nwindow.onload = function() {\n // Parse the tracking code from cookies.\n var trk = "sentinel_org_block";\n var cookies = document.cookie.split("; ");\n for (var i = 0; i < cookies.length; ++i) {\n if ((cookies[i].indexOf("trkCode=") == 0) && (cookies[i].length > 8)) {\n trk = cookies[i].substring(8);\n }\n }\n\n // Get the protocol for the redirect url.\n var protocol = "http:";\n if (window.location.protocol == "https:") {\n protocol = "https:";\n } else {\n // If "sl" cookie is set, redirect to https.\n for (var i = 0; i < cookies.length; ++i) {\n if ((cookies[i].indexOf("sl=") == 0) && (cookies[i].length > 3)) {\n window.location.href = "https:" + window.location.href.substring(window.location.protocol.length);\n return;\n }\n }\n }\n\n // Get the new domain. For touch.www.linkedin.com or tablet.www.linkedin.com\n // we strip "touch." and "tablet.". For international domains such as\n // fr.linkedin.com, we convert it to www.linkedin.com\n var domain = location.host;\n if (domain.substr(0, 6) == "touch.") {\n domain = domain.substr(6);\n } else if (domain.substr(0, 7) == "tablet.") {\n domain = domain.substr(7);\n } else if (domain.charAt(2) == ".") {\n domain = "www" + domain.substr(2);\n }\n \n window.location.href = "https://" + domain + "/uas/login?trk=" + trk + "&session_redirect=" +\n encodeURIComponent(protocol + "//" + domain +\n window.location.href.substr(window.location.href.search(window.location.host) +\n window.location.host.length));\n}\n\n\n

我做错了什么?

最佳答案

当我尝试在 Mac OS X 计算机上使用 Node.js 程序访问 LinkedIn 个人资料时,遇到了同样的问题。这是带有缩进的代码,以便更好地理解:

/**
 * Page-load handler that sends the visitor away from the current page:
 * either to the https version of the same URL (when the page was not loaded
 * over https and an "sl" cookie is present) or to the /uas/login page, with
 * the original location passed along in the `session_redirect` parameter.
 */
window.onload = function() {
    // True when `text` starts with `prefix` and has at least one more character after it.
    function hasValueAfter(text, prefix) {
        return text.indexOf(prefix) === 0 && text.length > prefix.length;
    }

    var loc = window.location;
    var cookieList = document.cookie.split("; ");

    // Tracking code: default value, overridden by a "trkCode" cookie
    // (when several are present, the last one wins).
    var trackingCode = "sentinel_org_block";
    cookieList.forEach(function(entry) {
        if (hasValueAfter(entry, "trkCode=")) {
            trackingCode = entry.substring(8);
        }
    });

    // Protocol used when rebuilding the URL to come back to after login.
    var scheme;
    if (loc.protocol === "https:") {
        scheme = "https:";
    } else {
        var hasSlCookie = cookieList.some(function(entry) {
            return hasValueAfter(entry, "sl=");
        });
        if (hasSlCookie) {
            // "sl" cookie is set: reload the very same URL over https and stop.
            loc.href = "https:" + loc.href.substring(loc.protocol.length);
            return;
        }
        scheme = "http:";
    }

    // Normalise the host: drop a leading "touch." / "tablet.", and turn a
    // two-letter subdomain (e.g. fr.linkedin.com) into www.linkedin.com.
    var host = loc.host;
    if (host.slice(0, 6) === "touch.") {
        host = host.slice(6);
    } else if (host.slice(0, 7) === "tablet.") {
        host = host.slice(7);
    } else if (host.charAt(2) === ".") {
        host = "www" + host.slice(2);
    }

    // Everything in the current URL that follows the host.
    // Note: search() interprets its string argument as a regular expression pattern.
    var currentUrl = loc.href;
    var afterHost = currentUrl.slice(currentUrl.search(loc.host) + loc.host.length);
    var returnTo = scheme + "//" + host + afterHost;

    loc.href = "https://" + host + "/uas/login?trk=" + trackingCode + "&session_redirect=" + encodeURIComponent(returnTo);
}

这似乎是一个用于阻止自动化程序访问的脚本,它会把用户重定向到登录页面。脚本会在 cookie 中查找名为“sl”的条目:如果页面不是通过 https 加载的并且找到了该条目,就会重定向到同一页面的 https 版本;否则,脚本不会让您查看该页面,而是把您重定向到 LinkedIn 登录页面。这就是我从这段代码中能够理解的内容,但遗憾的是我没能直接解决这个问题......

编辑:我已经能够通过使用 PhantomJS 访问该页面来解决我的问题。这样您就可以修改您的用户代理,这样 LinkedIn 就不会阻止您的连接。 这是我使用的代码:

var phantom = require('phantom');

// Kept outside the promise chain so that later steps (and the error handler)
// can reach the page and the PhantomJS instance created by earlier steps.
var sitepage = null;
var phInstance = null;

// Open a page in PhantomJS with a desktop-browser user agent, print the load
// status and the page content, then shut PhantomJS down.
phantom
    .create()
    .then(function(instance) {
        phInstance = instance;
        return instance.createPage();
    })
    .then(function(page) {
        sitepage = page;
        // Present a regular desktop Chrome user agent instead of the default one.
        page.setting('userAgent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36');
        // NOTE(review): `this.url` presumably comes from an enclosing object in
        // the code this snippet was taken from; in a standalone script `this`
        // has no `url` here - replace it with the address to load.
        return page.open(this.url);
    })
    .then(function(status) {
        console.log(status);
        return sitepage.property('content');
    })
    .then(function(body) {
        console.log(body);
        sitepage.close();
        phInstance.exit();
    })
    .catch(function(err) {
        console.log(err);
        // If create() itself rejected, no instance was ever stored; calling
        // exit() on null would throw and hide the original error.
        if (phInstance) {
            phInstance.exit();
        }
    });

关于node.js - 无法使用 Node 请求模块在 Linux 上读取 LinkedIn 内容,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/37588670/

相关文章:

python - 获取请求期间的字节数

C# (429) 请求过多

node.js - Keystone JS CORS

node.js - 使用 spawnSync 流式传输子进程的输出,但将输出保存到父变量

mysql - 从时间戳获取本地时间

node.js - 不能在 Node 中说 Hello World

python - supervisord 没有这样的过程 vaultier

node.js - axios post套接字仅在docker中挂起

javascript - 为 Firebase 存储中的文件创建 readStream

ubuntu - Pgadmin4 : Error module 'paramiko' has no attribute 'Ed25519Key'