python - 使用池追加到数组

标签 python multiprocessing pool

我正在尝试从soccerway.com 抓取数据,并检查该页面是否是一个已完成的游戏/要玩的游戏,每个实例都写入单独的 csv 文件。我正在运行 10,000 个页面,因此使用 Pools 编写了它。但是,我从追加函数中获取空列表,并且无法向 csv 文件写入任何内容。

我尝试直接写入文件而不是列表附加,但这给出了不完整的文件

import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import uuid
import time
from multiprocessing import Pool
import sys, os

fixturesA = []
linksA = []
statsA = []

def parse(url):
    try:
        #print(url)
        delays = [0.25,0.5,0.75,1]
        delay = np.random.choice(delays)
        #time.sleep(delay)
        #r = requests.get(url)
        r = requests.get(url, timeout = 10)
        soup = BeautifulSoup(r.content, "html.parser")
        teams = soup.findAll('h3', attrs = {'class' : 'thick'})
        homeTeam = teams[0].text.strip()
        awayTeam = teams[2].text.strip()
        middle = teams[1].text.strip()
        dds = soup.findAll('dd')
        date = dds[1].text.strip()
        gameWeek = dds[2].text.strip()
        if ':' not in middle:
            middle = middle.split(" - ")
            homeGoals = 0
            awayGoals = 0
            homeGoals = middle[0]
            try:
                awayGoals = middle[1]
            except Exception as e:
                homeGoals = "-1"
                awayGoals = "-1"
            matchGoals = int(homeGoals) + int(awayGoals)
            if(matchGoals >= 0):
                if(int(homeGoals) > 0 and int(awayGoals) > 0):
                    btts = "y"
                else:
                    btts = "n"
                halfTimeScore = dds[4].text.strip().split(" - ")
                firstHalfHomeGoals = halfTimeScore[0]
                firstHalfAwayConc = halfTimeScore[0]
                firstHalfAwayGoals = halfTimeScore[1]
                firstHalfHomeConc = halfTimeScore[1]
                firstHalfTotalGoals = int(firstHalfHomeGoals) + int(firstHalfAwayGoals)
                secondHalfHomeGoals = int(homeGoals) - int(firstHalfHomeGoals)
                secondHalfAwayConc = int(homeGoals) - int(firstHalfHomeGoals)
                secondHalfAwayGoals = int(awayGoals) - int(firstHalfAwayGoals)
                secondHalfHomeConc = int(awayGoals) - int(firstHalfAwayGoals)
                secondHalfTotalGoals = matchGoals - firstHalfTotalGoals

                homeTeamContainers = soup.findAll('div', attrs = {'class' : 'container left'})
                homeTeamStarting = homeTeamContainers[2]
                homeTeamBench = homeTeamContainers[3]
                homeTeamYellows = len(homeTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/YC.png' })) + len(homeTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/YC.png' }))
                homeTeamReds = len(homeTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/RC.png' })) + len(homeTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/RC.png' }))
                homeTeamCards = homeTeamYellows + homeTeamReds

                awayTeamContainers = soup.findAll('div', attrs = {'class' : 'container right'})
                awayTeamStarting = awayTeamContainers[2]
                awayTeamBench = awayTeamContainers[3]
               awayTeamYellows = len(awayTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/YC.png' })) + len(awayTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/YC.png' }))
                awayTeamReds = len(awayTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/RC.png' })) + len(awayTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/RC.png' }))
                awayTeamCards = awayTeamYellows + awayTeamReds

                matchCards = homeTeamCards + awayTeamCards
                try:
                    iframe = soup.findAll('iframe')
                    iframeSrc = iframe[1]['src']
                    url = 'https://us.soccerway.com/' + iframeSrc
                    c = requests.get(url,timeout = 10)
                    soupC = BeautifulSoup(c.content, "html.parser")

                    cornerContainer = soupC.findAll('td', attrs = {'class' : 'legend left value'})
                    homeCorners = cornerContainer[0].text.strip()
                    awayCornersConc = homeCorners
                    cornerContainer = soupC.findAll('td', attrs = {'class' : 'legend right value'})
                    awayCorners = cornerContainer[0].text.strip()
                    homeCornersConc = awayCorners
                    matchCorners = int(homeCorners) + int(awayCorners)

                    print("Got Score . " + homeTeam + " vs " + awayTeam+" . " + gameWeek )
                    statsA.append(homeTeam + "," + awayTeam  + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + homeCorners + "," + awayCorners + "," + homeCornersConc + "," + awayCornersConc + "," + str(matchCorners)+","+dds[0].text.strip() + "\n")
                    return None
                except Exception as e:
                    print("Got Score no corners. " + homeTeam + " vs " + awayTeam+" . " + gameWeek + " NO FRAME")
                    statsA.append(homeTeam + "," + awayTeam  + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + "" + "," + "" + "," + "" + "," + "" + "," + ""+","+dds[0].text.strip() + "\n")
                    return None
        else:
            fixturesA.append(homeTeam + "," + awayTeam  + "," + gameWeek + "," + date + "\n")
            linksA.append(url + "\n")
            print(homeTeam + " vs " + awayTeam + " at " + middle + " GW:" + gameWeek)
            return None
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        linksA.append(url + "\n")
        print(url)
        return None


stats = open('Statsv2.csv','a',encoding='utf-8')
fixtures = open('fixturesv2.csv','w',encoding='utf-8')

with open('links.txt') as f:
    content = f.readlines()
content = [x.strip() for x in content]

links = open('links.txt','w')

if __name__ == '__main__':
    start_time = time.time()
    p = Pool(20)  # Pool tells how many at a time
    records = p.map(parse, content)
    p.terminate()
    p.join()

    print("--- %s seconds ---" % (time.time() - start_time))

最佳答案

我假设您运行的是 Windows?那么答案是 Windows 中的多处理会创建副本而不是 fork 。因此,您拥有包含列表的主流程,并且您的工作流程(从池中)拥有自己单独的一组列表。

工作人员很可能正确填写了列表,但主进程中的列表没有获取任何数据,因此保持为空。 worker 们也没有归还任何东西。因此,当您在主进程中写入文件时,您会得到空文件。

解决此问题的一个简单方法是在主进程和工作线程之间创建管道或队列,以允许线程之间进行通信。您还可以使用共享数组,就像它们由多处理类提供的那样,但您需要在创建过程中知道长度。

参见文档:Multiprocessing

关于python - 使用池追加到数组,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/57437873/

相关文章:

linux - freeRTOS 和并行处理

javascript - 如何使用 getconnection 方法创建 Node js MySQL 池

python - 最新 python 更新后读取长路径(>256 个字符)的文件时出现问题

python - Flask-socketio 在后台线程中复制文件时丢失事件

python - Tkinter——如何水平居中 Canvas 文本?

Python多处理问题

design-patterns - 池和集群的区别

Python 多进程池。当其中一个工作进程确定不需要完成更多工作时如何退出脚本?

python属性样式

python - 如何在不占用所有内存的情况下使用 python-gnupg 加密大型数据集?