python - 下载大约 15000 个 URL 的元内容 Python - 线程

标签 python python-3.x multithreading asynchronous

我的 csv 中有大约 30000 个网址。我需要检查每个网址是否存在元内容。我使用 requests_cache 来缓存对 sqlite 数据库的响应。即使使用缓存系统也需要大约 24 小时。因此我转向并发。我认为我在 out = executor.map(networkCall, sites, headers) 上做错了。并且不知道如何修复它。

AttributeError: 'str' object has no attribute 'items'

import concurrent.futures
import requests
import threading
import time
import pandas as pd
import requests_cache
from PIL import Image
from io import BytesIO

# One requests.Session per worker thread is stored here (see getSess()).
thread_local = threading.local()

# Input spreadsheet: one URL per row in the 'URLS' column.
df = pd.read_csv("test.csv")
sites = [url for url in df['URLS']]

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent': user_agent}

# Cache every HTTP response in SQLite for 30 days (2592000 s) so repeat
# runs can skip the network entirely.
requests_cache.install_cache('network_call', backend='sqlite', expire_after=2592000)


def getSess():
    """Return this thread's requests.Session, creating it on first use."""
    session = getattr(thread_local, "session", None)
    if session is None:
        session = requests.Session()
        thread_local.session = session
    return session

def networkCall(url, headers):
    """Download *url* on the calling thread's session and return the body bytes."""
    print("In Download site")
    response = getSess().get(url, headers=headers)
    # Context manager guarantees the response is released after reading.
    with response:
        print(f"Read {len(response.content)} from {url}")
        return response.content

out = []
def getMeta(meta_res):
    """Record whether each parsed page carries usable meta content.

    For every soup-like object in *meta_res*, inspect its <meta> tags and,
    for each tag named 'description' or 'keywords', append 'Present' when
    the tag has non-empty content and 'Absent' otherwise.

    Appends to (and returns) the module-level ``out`` list, so results
    accumulate across calls.
    """
    print("Get data")
    for each in meta_res:
        for tag in each.find_all('meta'):
            # Only description/keywords tags are of interest.
            name = tag.attrs.get('name', '').strip().lower()
            if name in ('description', 'keywords'):
                # BUG FIX: the labels were inverted — non-empty content
                # previously appended "Absent". Also use .get() so a tag
                # with a name but no 'content' attribute no longer raises
                # KeyError.
                if tag.attrs.get('content', '') != '':
                    out.append("Present")
                else:
                    out.append("Absent")
    return out


def allSites(sites):
    """Download every URL in *sites* concurrently; return the bodies in order."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        # BUG FIX: executor.map(networkCall, sites, headers) zipped `sites`
        # with the *keys* of the headers dict, so each call received the
        # string 'User-Agent' as its headers argument — hence
        # "AttributeError: 'str' object has no attribute 'items'".
        # Every call must receive the same headers dict instead.
        results = executor.map(lambda url: networkCall(url, headers), sites)
        return list(results)


if __name__ == "__main__":
    # NOTE(review): this overwrites the CSV-derived `sites` list built at
    # import time with 30,000 copies of two fixed test URLs — presumably a
    # benchmark stand-in; confirm before production use.
    sites = [
    "https://www.jython.org",
    "http://olympus.realpython.org/dice",
    ] * 15000
    start_time = time.time()
    list_meta = allSites(sites)
    print("META   ", list_meta)
    duration = time.time() - start_time
    print(f"Downloaded {len(sites)} in {duration} seconds")
    # NOTE(review): list_meta holds raw bytes (networkCall returns
    # response.content), but getMeta calls .find_all() on each element —
    # this will fail unless each body is parsed (e.g. with BeautifulSoup)
    # first.
    output = getMeta(list_meta)
    # NOTE(review): `output` is ordered by meta tags found, not by URL, and
    # its length will generally differ from len(df) — pd.Series will leave
    # unmatched rows as NaN.
    df["is it there"] = pd.Series(output)
    df.to_csv('new.csv',index=False, header=True)

最佳答案

我试图模仿你的功能。以下代码在 4 分钟内执行:-

import concurrent.futures
import queue
import threading
import time

import requests
from bs4 import BeautifulSoup as BS


# 30,000 fetches total: the same two pages requested 15,000 times each.
URLs = 15_000 * [
    "https://www.jython.org",
    "http://olympus.realpython.org/dice",
]

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent': user_agent}


class SessionCache():
    """Bounded pool of requests.Session objects shared between threads.

    Up to ``cachesize`` sessions are created lazily. Once the cap is
    reached, getSession() blocks until another thread returns a session
    via putSession().
    """

    def __init__(self, cachesize=20):
        self.cachesize = cachesize
        self.sessions = 0       # number of sessions created so far
        self.q = queue.Queue()  # idle sessions available for reuse
        # BUG FIX: the original check-then-create on self.sessions was not
        # atomic, so concurrent callers could all pass the capacity check
        # and overshoot cachesize. Serialize creation with a lock.
        self._lock = threading.Lock()

    def getSession(self):
        """Return an idle session, creating one if under the cap; may block."""
        try:
            # Fast path: reuse an idle session without blocking.
            return self.q.get(block=False)
        except queue.Empty:
            pass
        with self._lock:
            if self.sessions < self.cachesize:
                self.q.put(requests.Session())
                self.sessions += 1
        # Blocks until a session is available (either the one just created
        # or one returned by another thread).
        return self.q.get()

    def putSession(self, session):
        """Return *session* to the pool for reuse by other threads."""
        self.q.put(session)


# Shared session pool reused by every worker thread in doGet().
CACHE = SessionCache()


def doGet(url):
    """Fetch *url* and classify its meta tags.

    Returns (url, 'Present') if a 'description' or 'keywords' meta tag has
    non-empty content, (url, 'Absent') if none does, or (url, error-string)
    if the request/parse fails. The borrowed session is always returned to
    the pool.
    """
    # BUG FIX: acquire the session *before* the try block. Previously, if
    # CACHE.getSession() itself raised, the finally clause referenced an
    # unbound `session` and the resulting NameError masked the real error.
    session = CACHE.getSession()
    try:
        response = session.get(url, headers=headers)
        response.raise_for_status()
        soup = BS(response.text, 'lxml')
        for meta in soup.find_all('meta'):
            name = meta.attrs.get('name')
            if name and name.strip().lower() in ('description', 'keywords'):
                if meta.attrs.get('content', '') != '':
                    return url, 'Present'
        return url, 'Absent'
    except Exception as e:
        # Best-effort: report the failure inline rather than aborting the
        # whole batch.
        return url, str(e)
    finally:
        CACHE.putSession(session)


def main():
    """Fan the URL list out over a thread pool, printing each result and timing."""
    started = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # map() yields (url, status) pairs in input order.
        for url, status in pool.map(doGet, URLs):
            print(f'{url} -> {status}')
    finished = time.perf_counter()
    print(f'Duration={finished-started:.4f}s')


if __name__ == '__main__':
    main()

关于python - 下载大约 15000 个 URL 的元内容 Python - 线程,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/69281148/

相关文章:

python - 在 Python 中访问 Google Drive API,无需 google-api-python-client(已安装的应用程序)

python - Itertools.product 引发 "Error in argument"

c - 如何判断两个线程是否访问了同一块内存?

python - 为什么我不能将 str 列表转换为 float 列表?

python - 如何在 Scrapy 中 'pause' 蜘蛛?

python - Django 亚马逊 S3 Heroku。连接被对端重置。删除对 collectstatic 脚本的确认

python-3.x - 如何使用 Gekko 释放变量

java - 限制迭代器的线程和/或重用线程?

Java volatile 变量多线程行为

python - Plotly:如何在烛台图表中的日期之间绘制垂直线?