我正在尝试使用 pyodbc 将 pandas 数据帧插入 MS SQL Server。我之前使用过类似的方法进行直接插入,但是这次我尝试的解决方案非常慢。有没有比我拥有的更简化的方法来完成 upsert?
sql_connect = pyodbc.connect('Driver={SQL Server Native Client 11.0}; Server=blank1; Database=blank2; UID=blank3; PWD=blank4')
cursor = sql_connect.cursor()
for index, row in bdf.iterrows():
res = cursor.execute("UPDATE dbo.MPA_BOOK_RAW SET [SITE]=?, [SHIP_TO]=?, [PROD_LINE]=?, [GROUP_NUMBER]=?, [DESCRIPTION]=?, [ORDER_QTY]=?, [BPS_INCLUDE]=? WHERE [CUST]=? AND [ORDER_NUMBER]=? AND [ORDER_DATE]=? AND [PURCHASE_ORDER]=? AND [CHANNEL]=? AND [ITEM]=? AND [END_DT]=?",
row['SITE'],
row['SHIP_TO'],
row['PROD_LINE'],
row['GROUP_NUMBER'],
row['DESCRIPTION'],
row['ORDER_QTY'],
row['BPS_INCLUDE'],
row['CUST'],
row['ORDER_NUMBER'],
row['ORDER_DATE'],
row['PURCHASE_ORDER'],
row['CHANNEL'],
row['ITEM'],
row['END_DT'])
if res.rowcount == 0:
cursor.execute("INSERT INTO dbo.MPA_BOOK_RAW ([SITE], [CUST], [ORDER_NUMBER], [ORDER_DATE], [PURCHASE_ORDER], [CHANNEL], [SHIP_TO], [PROD_LINE], [GROUP_NUMBER], [DESCRIPTION], [ITEM], [ORDER_QTY], [END_DT], [BPS_INCLUDE]) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
row['SITE'],
row['CUST'],
row['ORDER_NUMBER'],
row['ORDER_DATE'],
row['PURCHASE_ORDER'],
row['CHANNEL'],
row['SHIP_TO'],
row['PROD_LINE'],
row['GROUP_NUMBER'],
row['DESCRIPTION'],
row['ITEM'],
row['ORDER_QTY'],
row['END_DT'],
row['BPS_INCLUDE'])
sql_connect.commit()
cursor.close()
sql_connect.close()
我用我原来的 ~50k 行数据帧的五行样本尝试了上述方法,它运行良好。所以逻辑似乎没问题。只是速度是个问题。
最佳答案
这是使用 MERGE 的“upsert”示例:
from pprint import pprint
import pandas as pd
import sqlalchemy as sa
connection_string = (
"Driver=ODBC Driver 17 for SQL Server;"
"Server=192.168.0.199;"
"UID=scott;PWD=tiger^5HHH;"
"DATABASE=test;"
"UseFMTONLY=Yes;"
)
sqlalchemy_url = sa.engine.URL.create(
"mssql+pyodbc", query={"odbc_connect": connection_string}
)
engine = sa.create_engine(sqlalchemy_url, fast_executemany=True)
with engine.begin() as conn:
# set up test environment
conn.exec_driver_sql("DROP TABLE IF EXISTS actual_table;")
conn.exec_driver_sql(
"""\
CREATE TABLE actual_table (
institution_no VARCHAR(3),
transit_no VARCHAR(5),
branch_name VARCHAR(50),
CONSTRAINT PK_actual_table PRIMARY KEY CLUSTERED
(institution_no, transit_no));
"""
)
# actual_table initial state
conn.exec_driver_sql(
"""\
INSERT INTO actual_table (institution_no, transit_no, branch_name) VALUES
('002', '45678', 'Scotiabank branch #45678 - *** UPDATE NEEDED ***'),
('003', '67890', 'RBC branch #67890 - Sudbury, ON');
"""
)
# test data to be updated or inserted
df_update = pd.DataFrame(
[
("004", "12345", "TD branch #12345 - London, ON"),
("002", "45678", "Scotiabank branch #45678 - Timmins, ON"),
("004", "34567", "TD branch #34567 - Toronto, ON"),
],
columns=["institution_no", "transit_no", "branch_name"],
)
# Here's where the real work begins ...
#
# Step 1: upload update data
df_update.to_sql("#update_table", conn, index=False)
#
# Step 2: perform the "upsert"
sql = """\
MERGE actual_table AS a
USING (SELECT institution_no, transit_no, branch_name FROM #update_table) as u
ON (a.institution_no = u.institution_no AND a.transit_no = u.transit_no)
WHEN MATCHED THEN
UPDATE SET branch_name = u.branch_name
WHEN NOT MATCHED THEN
INSERT (institution_no, transit_no, branch_name)
VALUES (u.institution_no, u.transit_no, u.branch_name);
"""
result = conn.exec_driver_sql(sql)
# verify results
with engine.begin() as conn:
pprint(conn.exec_driver_sql("SELECT * FROM actual_table").fetchall())
"""console output:
[('002', '45678', 'Scotiabank branch #45678 - Timmins, ON'),
('003', '67890', 'RBC branch #67890 - Sudbury, ON'),
('004', '12345', 'TD branch #12345 - London, ON'),
('004', '34567', 'TD branch #34567 - Toronto, ON')]
"""
关于python - 使用 pyodbc 将 pandas 数据帧高效插入到 MS SQL Server,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54579287/