python - 使用 pandas 写入 sql 数据库

标签 python mysql sql pandas sqlalchemy

困惑。尝试用 python 构建英国新闻爬虫。

import feedparser
import pandas as pd

def poll_rss(rss_url):
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        print("Title:", entry.title)
        print("Description:", entry.description)
        print("\n")

def poll_rss(rss_url):
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        print("Title:", entry.title)
        print("Description:", entry.description)
        print("\n")

# Example usage:
feeds = [{"type": "news","title": "BBC", "url": "http://feeds.bbci.co.uk/news/uk/rss.xml"},
        {"type": "news","title": "The Economist", "url": "https://www.economist.com/international/rss.xml"},    
        {"type": "news","title": "The New Statesman", "url": "https://www.newstatesman.com/feed"},    
        {"type": "news","title": "The New York Times", "url": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"},
        {"type": "news","title": "Metro UK","url": "https://metro.co.uk/feed/"},
        {"type": "news", "title": "Evening Standard", "url": "https://www.standard.co.uk/rss.xml"},
        {"type": "news","title": "Daily Mail", "url": "https://www.dailymail.co.uk/articles.rss"},
        {"type": "news","title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"},
        {"type": "news", "title": "The Mirror", "url": "https://www.mirror.co.uk/news/?service=rss"},
        {"type": "news", "title": "The Sun", "url": "https://www.thesun.co.uk/news/feed/"},
        {"type": "news", "title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"},
        {"type": "news", "title": "The Guardian", "url": "https://www.theguardian.com/uk/rss"},
        {"type": "news", "title": "The Independent", "url": "https://www.independent.co.uk/news/uk/rss"},
        {"type": "news", "title": "The Telegraph", "url": "https://www.telegraph.co.uk/news/rss.xml"},
        {"type": "news", "title": "The Times", "url": "https://www.thetimes.co.uk/?service=rss"},
        {"type": "news", "title": "The Mirror", "url": "https://www.mirror.co.uk/news/rss.xml"}]

for feed in feeds:
    parsed_feed = feedparser.parse(feed['url'])
    
    print("Title:", feed['title'])
    print("Number of Articles:", len(parsed_feed.entries))
    print("\n")
    data = []
    for entry in parsed_feed.entries:
        title = entry.title
        url = entry.link
        print(entry.summary)
        if entry.summary:
            summary = entry.summary
            data.append(summary)
        else:
            entry.summary = "No summary available"
        if entry.published:
            date = entry.published
            data.append (data)
        else:
            data.append("No data available")

然后我有一些代码来整理保存。

df = pd.DataFrame(data)
df.columns = ['title', 'url', 'summary', 'date']
print("data" + df)
from sqlalchemy import create_engine
import mysql.connector
engine = create_engine('mysql+pymysql://root:password_thingbob@localhost/somedatabase')  
df.to_sql('nationals', con = engine, if_exists = 'append', index = False)

虽然nationals表已经创建,凭证也正确,但为什么不保存?

最佳答案

如果凭据如您所说正确,则 to_sql 调用就可以。我认为问题在于解析 feed 的 Python 循环。特别是,data.append (data) 行正在创建一个无法构造到数据帧中的递归列表。另外,我认为 data 列表应该是一个嵌套列表,其中每个子列表都是 parsed_feed 中的一个条目(以便数据框中的每一行都是一个条目)。

我会将循环写为

data = []                               # <---- initialize empty list here
for feed in feeds:    
    parsed_feed = feedparser.parse(feed['url'])
    print("Title:", feed['title'])
    print("Number of Articles:", len(parsed_feed.entries))
    print("\n")
    for entry in parsed_feed.entries:
        title = entry.title
        url = entry.link
        print(entry.summary)
        summary = entry.summary or "No summary available" # I simplified the ternary operators here
        date = entry.published or "No data available"     # I simplified the ternary operators here
        data.append([title, url, summary, date])          # <---- append data from each entry here

df = pd.DataFrame(data, columns = ['title', 'url', 'summary', 'date'])
from sqlalchemy import create_engine
import mysql.connector
engine = create_engine('mysql+pymysql://root:password_thingbob@localhost/somedatabase')  
df.to_sql('nationals', con = engine, if_exists = 'append', index = False)

我用您提供的提要列表检查了它,它工作正常。

关于python - 使用 pandas 写入 sql 数据库,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/75499220/

相关文章:

MySQL ORDER BY 聚合列返回不正确的结果

java - 在顶级关系数据库上运行ElasticSearch

python - 选择高于临界阈值的 pandas 数据框元素

python - pdb.set_trace() 是否总是覆盖错误回溯?

mysql - 为什么这个 MySQL 存储过程不能按我想要的方式工作?

mysql - 如何为 INSERT IGNORE 获取 mysql_errno()

mysql - 如何将重复数据值分组到我的脚本中

python - 在 Alembic 迁移中使用 SQLAlchemy ORM : how do I?

python - 如何将带有分隔符的数据框列转换为三列?

c# - 大量使用 SqlDataSource 和太多 3306 时间等待连接