我正在尝试从news.google.com的高处底部刮取查询。我的脚本可以在en-us上正常运行,但是当我尝试添加其他语言环境时,它仍然会提供en-us查询。我的浏览器默认将ceid和hl参数分配为en-us。
package main
import (
"encoding/json"
"fmt"
"github.com/gocolly/colly"
"log"
"net/http"
"net/url"
"strings"
)
// difference returns the elements in `a` that aren't in `b`.
func difference(a, b []string) []string {
mb := make(map[string]struct{}, len(b))
for _, x := range b {
mb[x] = struct{}{}
}
var diff []string
for _, x := range a {
if _, found := mb[x]; !found {
diff = append(diff, x)
}
}
return diff
}
func APIResponseRequest(locale string) []string{
var queries []string
stopWords := []string {"Google apps", "Top stories", "U.S.", "World",
"Your local news", "Business", "Technology", "Entertainment", "Sports", "Science", "Health",
"Get the Android app", "Get the iOS app", "Help", "Get perspectives and context", "Google", "",}
// Instantiate default collector
c := colly.NewCollector(
// Visit only domains: news.google.com
colly.AllowedDomains("news.google.com"),
)
// On every a element which has href attribute call callback
c.OnHTML("a", func(e *colly.HTMLElement) {
link := e.Attr("aria-label")
if link != "" {
fmt.Println("printing queries ", link)
queries = append(queries, string(link))
}
})
// Before making a request print "Visiting ..."
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("ceid", "es:es")
fmt.Println("Visiting", r.URL.String())
})
// Start scraping on news.google.com
u := "https://news.google.com"
server, err := url.Parse(u)
if err != nil {
panic(err)
}
q := server.Query()
q.Add("pz", "1")
q.Add("hs",locale)
q.Add("gl",locale)
q.Add("ceid", strings.Replace(locale, "-",":", -1))
server.RawQuery = q.Encode()
c.Visit(server.String())
diff_queries := difference(queries, stopWords )
return diff_queries
}
func newshandler(w http.ResponseWriter, r *http.Request) {
locale_queryset := make(map[string]interface{})
countries_supported := []string{"es-es","de-de"}
//countries_supported := []string{"en-us"}
for _, locale := range(countries_supported){
queries := APIResponseRequest(locale)
//fmt.Println(locale, ":", queries)
locale_queryset[locale] = queries
}
fmt.Println(locale_queryset)
json.NewEncoder(w).Encode(locale_queryset)
}
func main() {
http.HandleFunc("/api/gnews", newshandler)
fmt.Println("Listening at http://localhost:8000/api/gnews")
log.Fatal(http.ListenAndServe(":8000", nil))
}
我希望抓取的网址是:https://news.google.com?ceid=es%3Aes&gl=es-es&hs=es-es&pz=1
当我单击它时:https://news.google.com/topstories?ceid=US:en&gl=US&hs=es-es&pz=1&hl=en-US
而且我仍然可以从en-us本身那里获得新闻查询。
如何添加标题,仅显示区域设置es-es结果
最佳答案
实际添加hl作为参数并使用语言环境作为值是可行的,我能够从不同的查询中获取新闻查询。
刚刚添加了这一行:q.Add("hl",locale)
我不确定这些参数是什么?有人知道这些是什么意思吗?
关于html - 当我尝试查询其他国家/地区的es-es时,抓取news.google.com将hl和ceid参数添加为en-us,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/61093855/