html - 当我尝试查询其他国家/地区的es-es时,抓取news.google.com将hl和ceid参数添加为en-us

标签 html go web-scraping

我正在尝试抓取news.google.com页面上从上到下的查询。我的脚本在en-us下可以正常运行,但是当我尝试添加其他语言环境时,它返回的仍然是en-us的查询。我的浏览器默认会将ceid和hl参数设置为en-us。


    package main

import (
    "encoding/json"
    "fmt"
    "github.com/gocolly/colly"
    "log"
    "net/http"
    "net/url"
    "strings"
)

// difference reports every element of `a` that does not occur in `b`.
// The relative order of `a` (including duplicates) is kept; the result is
// nil when no element of `a` survives.
func difference(a, b []string) []string {
    exclude := make(map[string]struct{}, len(b))
    for i := range b {
        exclude[b[i]] = struct{}{}
    }

    var kept []string
    for _, item := range a {
        if _, skip := exclude[item]; skip {
            continue
        }
        kept = append(kept, item)
    }
    return kept
}

// APIResponseRequest scrapes the news.google.com front page for the given
// locale (for example "es-es") and returns the aria-label text of every <a>
// element on the page, minus the entries listed in stopWords.
//
// The locale is sent as the hl, hs and gl query parameters, and as ceid with
// every "-" replaced by ":". Without hl the site answered with en-us content
// regardless of the other parameters, so hl must always be present.
//
// If the visit fails, the error is logged and whatever labels were collected
// up to that point (possibly none) are returned.
func APIResponseRequest(locale string) []string {
    var queries []string
    // aria-labels of navigation and boilerplate links (English page) that are
    // not news queries; the empty string is listed so it is filtered as well.
    stopWords := []string{"Google apps", "Top stories", "U.S.", "World",
        "Your local news", "Business", "Technology", "Entertainment", "Sports", "Science", "Health",
        "Get the Android app", "Get the iOS app", "Help", "Get perspectives and context", "Google", ""}

    // Instantiate a collector restricted to news.google.com.
    c := colly.NewCollector(
        colly.AllowedDomains("news.google.com"),
    )

    // Collect the aria-label of every anchor element that has a non-empty one.
    c.OnHTML("a", func(e *colly.HTMLElement) {
        label := e.Attr("aria-label")
        if label == "" {
            return
        }
        fmt.Println("printing queries ", label)
        queries = append(queries, label)
    })

    // Before making a request print "Visiting ...".
    // The locale travels in the URL query string built below; the former
    // "ceid" HTTP header was hard-coded to "es:es" for every locale and has
    // been dropped.
    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL.String())
    })

    // Build the start URL with the locale parameters.
    server, err := url.Parse("https://news.google.com")
    if err != nil {
        // The URL is a constant, so a parse failure is a programming error.
        panic(err)
    }
    q := server.Query()
    q.Add("pz", "1")
    q.Add("hl", locale) // interface language; the parameter that actually selects the locale
    q.Add("hs", locale)
    q.Add("gl", locale)
    q.Add("ceid", strings.Replace(locale, "-", ":", -1))
    server.RawQuery = q.Encode()

    target := server.String()
    if err := c.Visit(target); err != nil {
        log.Printf("visiting %s: %v", target, err)
    }
    return difference(queries, stopWords)
}

// newshandler is an HTTP handler that scrapes the news queries for every
// supported locale and writes them to the response as a JSON object keyed by
// locale. The same map is also printed to standard output.
//
// A failure while encoding or writing the response is logged; by then the
// status line may already have been sent, so no error status is attempted.
func newshandler(w http.ResponseWriter, r *http.Request) {
    supportedLocales := []string{"es-es", "de-de"}
    //supportedLocales := []string{"en-us"}

    queriesByLocale := make(map[string][]string, len(supportedLocales))
    for _, locale := range supportedLocales {
        queriesByLocale[locale] = APIResponseRequest(locale)
    }
    fmt.Println(queriesByLocale)

    // Declare the body as JSON instead of relying on content sniffing.
    w.Header().Set("Content-Type", "application/json")
    if err := json.NewEncoder(w).Encode(queriesByLocale); err != nil {
        log.Printf("encoding news queries response: %v", err)
    }
}

// main registers newshandler under /api/gnews on the default mux and serves
// HTTP on port 8000, terminating the process when the server returns an error.
func main() {
    http.HandleFunc("/api/gnews", newshandler)
    fmt.Println("Listening at http://localhost:8000/api/gnews")

    // ListenAndServe only returns on failure, always with a non-nil error.
    if err := http.ListenAndServe(":8000", nil); err != nil {
        log.Fatal(err)
    }
}

我希望抓取的网址是:https://news.google.com?ceid=es%3Aes&gl=es-es&hs=es-es&pz=1
当我单击它时:https://news.google.com/topstories?ceid=US:en&gl=US&hs=es-es&pz=1&hl=en-US
而且我仍然可以从en-us本身那里获得新闻查询。

如何添加请求头(header),使其仅返回区域设置es-es的结果?

最佳答案

实际上,添加hl作为参数并使用语言环境作为值是可行的,这样我就能够获取不同语言环境下的新闻查询。

刚刚添加了这一行:q.Add("hl",locale)
我不确定这些参数是什么?有人知道这些是什么意思吗?

关于html - 当我尝试查询其他国家/地区的es-es时,抓取news.google.com将hl和ceid参数添加为en-us,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/61093855/

相关文章:

javascript - 自定义/隐藏分页导航

GOBIN 未设置 : cannot run go install

go - 我在并发方面缺少什么?

android - Jsoup.parse 在 19 种以上的 Android 设备上花费 10 倍以上的时间

python - 使用请求抓取网页不会返回所有数据

javascript - D3js - 强制定向图 - 相邻节点和链接的高级突出显示,这可能吗?

javascript - Javascript自定义视频播放器当前时间/总时间

http - Golang - 检查错误和推迟操作的正确顺序是什么?

python - 无法单击页面 Selenium python 上的元素

javascript - jQuery 和 HTML5 元素