Python web scraper for Target

Tags: python web-scraping

I'm a novice programmer trying to get a Python web scraper running against Target.com. I've pasted the code below.

The problem I'm trying to solve is that no csv file is created at the end when I run the script. The web browser opens and the data prints as the script runs in Sublime Text, so I'm confused as to why output.csv never shows up.

Any help is much appreciated. Thank you!

import requests
import csv
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import html


cats = [
    ('Natural Laundry Detergent','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Z55t1q?Nao=0'),
    ('Natural All-Purpose Cleaner','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Zzag5n?Nao=0'),
    ('Natural Dish Soaps','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Zx6dg5?Nao=0'),
    ('Natural Hair Shampoo','https://www.target.com/c/natural-hair-care/-/N-4smdrZ56ecv?Nao=0'),
    ('Natural Hair Conditioner','https://www.target.com/c/natural-hair-care/-/N-4smdrZv1cqo?Nao=0'),
    ('Natural Body Wash','https://www.target.com/c/natural-personal-care/-/N-4smdpZ5td3p?Nao=0'),
    ('Baby Shampoo and Body Wash','https://www.target.com/c/baby-toiletries-bath-potty/-/N-5xtjdZ54wt4?Nao=0'),
    ('Baby Bath Wash' ,'https://www.target.com/c/baby-toiletries-bath-potty/baby-bath-wash/-/N-5xtjdZ5ri3m'),
    ('Baby Bubble Bath' ,'https://www.target.com/c/baby-toiletries-bath-potty/-/N-5xtjdZ5t3hx?Nao=0'),
    ('Stain Removers', 'https://www.target.com/s?searchTerm=stain+remover&facetedValue=56cpg&Nao=0'),
    ('Baby Lotions', 'https://www.target.com/c/baby-toiletries-bath-potty/baby-lotions/-/N-5xtjdZ5vg2t'),
    ('Tampons','https://www.target.com/c/tampons-feminine-products-personal-care/-/N-4y634'),
    ('Maxi Pads','https://www.target.com/c/maxi-pads-feminine-products-personal-care/-/N-4y633'),
    ('Feminine Hygiene','https://www.target.com/c/feminine-hygiene-products-personal-care/-/N-4y631'),
]


class TargetClient(object):
    def __init__(self):
        self.wd = webdriver.Chrome(executable_path=r'C:\Users\wquar\AppData\Local\Programs\Python\Python37\chromedriver.exe')
        self.base_url = 'https://www.target.com'

    def gather_product_links(self):
        soup = BeautifulSoup(self.wd.page_source)
        divs = soup.select('div[class*="ProductCardImageWrapper"]')
        links = [self.base_url + d.a['href'] for d in divs]
        return links

    def goto_next_page(self):
        ele = self.wd.find_element_by_xpath("//a[@aria-label='next page']")
        ele.click()
        time.sleep(1.5)


    def _format_product_name(self,input):
        out = input.replace('®','').replace('\x99','')
        return out

    def _format_brand_name(self,input):
        out = input.replace('®','').replace('\x99','')
        out = html.unescape(out)

        if out == "Murphy's":
            out = 'Murphy'
        elif out == 'ECOS by Earth Friendly Products':
            out = 'Ecos'
        elif out == 'Eden Body Works':
            out = 'EDEN BodyWorks'
        elif out == 'BRÖÖ':
            out = 'BRöö'
        elif out == 'Love Beauty & Planet':
            out = 'Love Beauty And Planet'
        elif out == 'Hask':
            out = 'HASK'
        elif out == 'Palmers':
            out = "Palmer's"
        elif out == 'MacVoil':
            out = "Macvoil"
        elif out == 'Dear Clark,':
            out = "Dear Clark"
        elif out == 'Earth Science Naturals':
            out = "Earth Science"
        elif out == 'PAW Patrol':
            out = "Paw Patrol"
        elif out == 'up & up™':
            out = "Up&Up"
        elif out == 'Johnson & Johnson':
            out = "Johnson's"
        elif out == 'Earth Mama Angel Baby':
            out = "Earth Mama"
        elif out == 'Mielle Organics':
            out = "Mielle"
        elif out == 'EveryDay Coconut':
            out = "Alaffia"
        elif out == 'Olivina':
            out = "OLIVINA MEN"
        elif out == 'AVALON':
            out = "Avalon"
        elif out == 'Oxi Clean':
            out = "OxiClean"
        elif out == 'Village Naturals':
            out = "Nourishing Naturals"
        elif out == 'everyone':
            out = "everyone"
        elif out == 'Savannah Bee Company':
            out = 'Savannah Bee'
        elif out == 'Camille Rose Natural':
            out = 'Camille Rose'

        return out

    def _get_product_name(self, complete_product_name, brand_name):
        if brand_name == 'Alaffia':
            return complete_product_name.split(' -')[0].strip()
        elif brand_name == 'SoCozy' and 'So Cfl ozy' in complete_product_name:
            return complete_product_name.split('So Cfl ozy')[1].split(' -')[0].strip()
        elif brand_name == 'Ecos' and 'ECOS' in complete_product_name:
            return complete_product_name.split('ECOS')[1].split(' -')[0].strip()
        elif brand_name == 'Clorox 2' and 'Clorox2' in complete_product_name:
            return complete_product_name.split('Clorox2')[1].split(' -')[0].strip()

        product_name = complete_product_name.split(brand_name)[1].split(' -')[0].strip()
        return product_name

    def scrape_product_page(self, url, category):
        r = requests.get(url)
        soup = BeautifulSoup(r.content)
        d = {}

        try:
            complete_product_name = soup('span',attrs={'data-test':'product-title'})[0].text
        except:
            print('ERROR')
            return None

        complete_product_name = self._format_product_name(complete_product_name)

        print(complete_product_name)

        brand_name = soup.select('div[class*="ProductDetailsTitle"]')[0].text.split('Shop all')[-1].strip()
        brand_name = self._format_brand_name(brand_name)

        d['Brand'] = brand_name

        #return (complete_product_name, brand_name)

        try:
            product_name = self._get_product_name(complete_product_name,brand_name)
        except:
            print('PRODUCT ERROR')
            print('PRODUCT ERROR')
            return None

        d['Product'] = product_name

        try:
            d['Capacity'] = soup('b',text='Capacity (Volume):')[0].next.next.strip()
        except:
            d['Capacity'] = self._parse_capacity_from_title(complete_product_name)

        try:
            d['Scent'] = soup('b',text='Scent:')[0].next.next.strip()
        except:
            d['Scent'] = ''

        try:
            d['Price'] = soup('div',attrs={'data-test':'product-price'})[0].span.text
        except:
            d['Price'] = ''

        try:
            d['Product Form'] = soup('b',text='Product Form:')[0].next.next.strip()
        except:
            d['Product Form'] = ''

        try:
            star_rating =soup('div',attrs={'data-ref':'rating-mask'})[0].attrs['style'].split('width:')[1]
            d['Star Rating'] = round(float(star_rating.split('%')[0]) / 20, 2)
        except:
            d['Star Rating']=''

        try:
            d['Number of Ratings'] = soup('span',attrs={'data-test':'ratingCount'})[0].text.strip()
            if d['Number of Ratings'] == 'be the first!':
                d['Number of Ratings'] = 0
        except:
            d['Number of Ratings'] = ''

        try:
            d['Health Facts'] = soup('b',text='Health Facts:')[0].next.next.strip()
        except:
            d['Health Facts'] = ''

        try:
            d['Features'] = soup('b',text='Features:')[0].next.next.strip()
        except:
            d['Features'] = ''

        try:
            d['Wellness Standard'] = soup('b',text='Wellness Standard:')[0].next.next.strip()
        except:
            d['Wellness Standard'] = ''

        try:
            d['Sustainability Claims'] = soup('b',text='Sustainability Claims:')[0].next.next.strip()
        except:
            d['Sustainability Claims'] = ''


        try:
            d['Number of Uses'] = soup('b',text='Number of uses:')[0].next.next.strip()
        except:
            d['Number of Uses'] = self._parse_num_uses_from_title(complete_product_name)


        try:
            d['UPC Code'] = soup('b',text='UPC')[0].next.next.next.next.strip()
        except:
            d['UPC Code'] = ''

        d['URL'] = url
        d['Category'] = category
        d['Package Quantity'] = self._parse_quant_from_title(complete_product_name)

        return d

    def _parse_capacity_from_title(self,input):
        m = re.search('\d+(\.\d)? ?(fl)? ?oz',input,re.IGNORECASE)

        if m:
            return m.group()
        return ''

    def _parse_quant_from_title(self,input):
        m = re.search('\d+ ?pk',input)

        if m:
            return m.group().split('pk')[0].strip()
        return 1

    def _parse_num_uses_from_title(self,input):
        m = re.search('\d+ ?ct',input)
        if m:
            return m.group().split('ct')[0]
        return ''

    def scrape_cat(self, cat_name, url):
        h = []
        self.wd.get(url)
        links = self.gather_all_product_links()
        for l in links:
            print (l)
            res = self.scrape_product_page(l, cat_name)
            h.append(res)
        return h

    def gather_all_product_links(self):
        links = self.gather_product_links()
        while True:
            try:
                self.goto_next_page()
                links.extend(self.gather_product_links())
            except:
                return [l for l in list(set(links)) if '-category-' not in l]

def main():
    h = []
    targ = TargetClient()
    for cat_name, url in cats:
        data = targ.scrape_cat(cat_name, url)
        h.extend(data)
    return h
    write_csv(data)

def write_csv(data):
    data = [x for x in data if x]
    f = open('output.csv','w')
    fields = ['Category','Brand', 'Product', 'Scent', 'Price','Package Quantity','Product Form', 'Capacity', 'Number of Uses', 'Star Rating', 'Number of Ratings', 'Health Facts', 'Features', 'Wellness Standard', 'Sustainability Claims', 'UPC Code', 'URL'] 
    dw = csv.DictWriter(f,fieldnames=fields)
    dw.writeheader()
    dw.writerows(data)


if __name__ == '__main__':
    main()

Accepted Answer

In your main(), the return h statement comes before write_csv(data), and nothing after a return ever executes, so the file is never written. You have to call write_csv before you return from main().

Like this:

def main():
    h = []
    targ = TargetClient()
    for cat_name, url in cats:
        data = targ.scrape_cat(cat_name, url)
        h.extend(data)
    write_csv(h)   # write the accumulated rows before returning
    return h

Hope this helps.
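As a further hardening step (a minimal sketch, not part of the original answer, reusing the import csv and field list from the script above), write_csv can open the file with a context manager, newline='' and an explicit encoding. The csv module documentation recommends newline='' so rows are not interleaved with blank lines on Windows, and utf-8 avoids UnicodeEncodeError on product names containing non-ASCII characters:

def write_csv(data):
    # Drop the None entries that scrape_product_page returns on failure
    data = [x for x in data if x]
    fields = ['Category', 'Brand', 'Product', 'Scent', 'Price', 'Package Quantity',
              'Product Form', 'Capacity', 'Number of Uses', 'Star Rating',
              'Number of Ratings', 'Health Facts', 'Features', 'Wellness Standard',
              'Sustainability Claims', 'UPC Code', 'URL']
    # newline='' is what the csv module expects for its own line handling;
    # utf-8 avoids encode errors on names with characters such as '®'
    with open('output.csv', 'w', newline='', encoding='utf-8') as f:
        dw = csv.DictWriter(f, fieldnames=fields)
        dw.writeheader()
        dw.writerows(data)

The context manager also guarantees the file is flushed and closed even if writerows raises, which the original open() call without a close() did not.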

Regarding this Python web scraper for Target, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/52742399/
