python - 数据框差异

标签 python pandas

所以我有下面的代码:

import pandas as pd

# Column labels shared by both sample data sets.
cols = [
    'Col001',
    'Col002',
    'Col003',
    'Col004',
    'Col005',
    'Col006',
    'Col007',
    'Col008',
    'Col009',
]

# Sample rows for the first data set; each inner list follows the order of `cols`.
dataA = [
    ['AB1', 'A', 100, 'NY', 0.01, 23, 'PQR', 1003, 0.002],
    ['AB2', 'B', 201, 'NY', 0.03, 13, 'MNO', 1232, 0.004],
    ['AB3', 'A', 234, 'NJ', 0.05, 54, 'ABC', 3443, 0.003],
    ['AB4', 'V', 221, 'DE', 0.05, 67, 'ABC', 2345, 0.023],
    ['AB5', 'B', 342, 'CT', 0.04, 89, 'MNO', 3457, 0.023],
    ['AB6', 'N', 222, 'NY', 0.02, 67, 'PQR', 7665, 0.032],
    ['AB7', 'F', 342, 'PA', 0.03, 56, 'ABC', 5767, 0.067],
    ['AB8', 'C', 453, 'CA', 0.04, 34, 'PQR', 7563, 0.045],
    ['AB9', 'B', 123, 'CT', 0.03, 65, 'PQR', 3465, 0.034],
    ['AB10', 'C', 443, 'NJ', 0.03, 66, 'MNO', 3433, 0.087],
]

# Sample rows for the second data set; it differs from dataA in five cells.
dataB = [
    ['AB1', 'A', 100, 'NY', 0.01, 23, 'PQR', 1003, 0.002],
    ['AB2', 'B', 201, 'NY', 0.03, 13, 'MNO', 1232, 0.004],
    ['AB3', 'A', 234, 'NJ', 0.05, 54, 'ABC', 3443, 0.003],
    ['AB4', 'V', 221, 'DE', 0.08, 67, 'ABC', 2345, 0.023],
    ['AB5', 'B', 342, 'NJ', 0.04, 89, 'MNO', 3457, 0.023],
    ['AB6', 'N', 222, 'NY', 0.02, 67, 'PQR', 7665, 0.032],
    ['AB7', 'F', 342, 'PA', 0.03, 56, 'MNO', 5767, 0.067],
    ['AB8', 'C', 453, 'CA', 0.04, 34, 'PQR', 7563, 0.048],
    ['AB9', 'B', 123, 'CT', 0.03, 65, 'PQR', 2353, 0.034],
    ['AB10', 'C', 443, 'NJ', 0.03, 66, 'MNO', 3433, 0.087],
]

def getDataFrame(source,sourceName,columns=None):
    """Build a DataFrame from row data and tag every row with its origin.

    Args:
        source: Row data accepted by ``pd.DataFrame`` (e.g. a list of lists).
        sourceName: Value stored in the added ``DataSource`` column.
        columns: Column labels for ``source``. Defaults to the module-level
            ``cols`` list, which keeps existing two-argument calls unchanged.

    Returns:
        A new DataFrame with the requested columns plus ``DataSource``.
    """
    if columns is None:
        # Resolved at call time so the default follows the module-level list.
        columns = cols
    df = pd.DataFrame(source, columns=columns)
    df['DataSource'] = sourceName
    return df

def compareDataFrames(sourceDataFrame,newDataFrame):
    """Return the rows that have no identical counterpart in the combined data.

    Both frames are stacked and every row whose values (ignoring the
    ``DataSource`` column) occur more than once is removed, so only rows
    that differ between the two inputs remain.

    Missing values are compared as equal to each other: a row containing
    NaN is kept when its counterpart differs and removed when the counterpart
    is identical. (Grouping on the columns instead would silently discard
    every row that has NaN in any compared column.)

    Args:
        sourceDataFrame: First frame; must contain ``Col001`` and ``DataSource``.
        newDataFrame: Second frame with the same columns.

    Returns:
        The unmatched rows, indexed by their position in the stacked frame and
        sorted by ``Col001``. The sort is stable, so for equal ``Col001``
        values rows from ``sourceDataFrame`` come before rows from
        ``newDataFrame``.
    """
    targetDF = pd.concat([sourceDataFrame, newDataFrame], ignore_index=True)
    # Compare on every column except the origin tag.
    columnsGroup = [column for column in targetDF.columns if column != 'DataSource']
    # keep=False removes all members of a duplicate set, leaving only rows
    # that appear exactly once across both inputs.
    targetDF = targetDF.drop_duplicates(subset=columnsGroup, keep=False)
    # mergesort is stable, which keeps the output order deterministic.
    targetDF = targetDF.sort_values(by=['Col001'], ascending=[True], kind='mergesort')
    return targetDF

def getDiff():
    """Return the rows that differ between the 'Legacy' and 'New' sample data."""
    labelled = ((dataA, 'Legacy'), (dataB, 'New'))
    # Tag each data set with its origin, then keep only the unmatched rows.
    legacyFrame, newFrame = [getDataFrame(rows, label) for rows, label in labelled]
    return compareDataFrames(legacyFrame, newFrame)

它的工作原理与预期完全一致。输出为

print (getDiff()[cols])
   Col001 Col002  Col003 Col004  Col005  Col006 Col007  Col008  Col009
3     AB4      V     221     DE    0.05      67    ABC    2345   0.023
13    AB4      V     221     DE    0.08      67    ABC    2345   0.023
4     AB5      B     342     CT    0.04      89    MNO    3457   0.023
14    AB5      B     342     NJ    0.04      89    MNO    3457   0.023
16    AB7      F     342     PA    0.03      56    MNO    5767   0.067
6     AB7      F     342     PA    0.03      56    ABC    5767   0.067
7     AB8      C     453     CA    0.04      34    PQR    7563   0.045
17    AB8      C     453     CA    0.04      34    PQR    7563   0.048
8     AB9      B     123     CT    0.03      65    PQR    3465   0.034
18    AB9      B     123     CT    0.03      65    PQR    2353   0.034
Press any key to continue . . .

到目前为止,一切都很好。但我不喜欢这个输出。即使它找到了差异,它也会显示每个差异行的所有列。

所以,我编写了另一种方法,可以只给出差异:

def _is_missing(value):
    """Return True when *value* is a scalar null marker such as None or NaN."""
    return pd.api.types.is_scalar(value) and pd.isna(value)


def _first_rows_by_key(frame, sourceName, compareCols):
    """Map each Col001 key of one data source to the values of its first row."""
    subset = frame[frame['DataSource'] == sourceName].drop_duplicates(subset='Col001')
    # astype(object) keeps every cell's own type instead of upcasting
    # mixed int/float columns to float when the values are extracted.
    rows = subset[compareCols].astype(object).values.tolist()
    return dict(zip(subset['Col001'].tolist(), rows))


def createDiffDataFrame(diffDataframe,ignoreCols):
    """Reduce a frame of differing rows to one line per differing cell.

    For every ``Col001`` key the first 'Legacy' row is compared with the first
    'New' row, column by column. Each differing column yields one output row.
    Two missing values (None/NaN) in the same column count as equal.

    A key present in only one source yields a single row whose ``ColumnName``
    is 'MISSING'; the absent side holds 'MISSING' and the present side holds
    its source name ('Legacy' or 'New').

    Each source is filtered once and looked up through a dictionary, so the
    work is a single pass over the rows rather than one scan per key.

    Args:
        diffDataframe: Frame with ``Col001`` and ``DataSource`` columns, such
            as the result of ``compareDataFrames``.
        ignoreCols: Column labels that must not be compared.

    Returns:
        A DataFrame indexed by ``Col001`` with the columns ``ColumnName``,
        ``LegacyValue`` and ``NewValue``.
    """
    # Col001 is the join key, so it can never differ within a key.
    compareCols = [
        eachCol for eachCol in diffDataframe.columns
        if eachCol not in ignoreCols and eachCol != 'Col001'
    ]
    legacyRows = _first_rows_by_key(diffDataframe, 'Legacy', compareCols)
    newRows = _first_rows_by_key(diffDataframe, 'New', compareCols)

    diffData = []
    for eachContract in diffDataframe.Col001.unique():
        legacyValues = legacyRows.get(eachContract)
        newValues = newRows.get(eachContract)
        if legacyValues is None:
            diffData.append([eachContract, 'MISSING', 'MISSING', 'New'])
        elif newValues is None:
            diffData.append([eachContract, 'MISSING', 'Legacy', 'MISSING'])
        else:
            for eachCol, legacyValue, newValue in zip(compareCols, legacyValues, newValues):
                if legacyValue == newValue:
                    continue
                # NaN != NaN, so two missing cells need an explicit check.
                if _is_missing(legacyValue) and _is_missing(newValue):
                    continue
                diffData.append([eachContract, eachCol, legacyValue, newValue])
    diffDF = pd.DataFrame(diffData, columns=['Col001', 'ColumnName', 'LegacyValue', 'NewValue'])
    diffDF = diffDF.set_index('Col001')
    return diffDF

现在,我的输出是完美的:

x = getDiff()
print (createDiffDataFrame(x,['DataSource']))
       ColumnName LegacyValue NewValue
Col001
AB4        Col005        0.05     0.08
AB5        Col004          CT       NJ
AB7        Col007         ABC      MNO
AB8        Col009       0.045    0.048
AB9        Col008        3465     2353
Press any key to continue . . .

我的问题是: 尽管我得到了预期的输出,但在具有 114 列和超过 50K 行的真实数据上,运行 createDiffDataFrame() 函数需要很长时间。有没有更好的方法来实现 createDiffDataFrame?这种需求到底适不适合用 pandas 来实现?

最佳答案

为什么不尝试只使用 pandas 呢?

# NOTE(review): assumes dataA and dataB are already pandas DataFrames here;
# the question defines them as plain lists of rows - confirm before running.
# Build a per-row signature by joining every cell (as text) with commas.
dataA['New']=dataA.apply(lambda x :','.join(x.astype(str)),axis=1)
dataB['New']=dataB.apply(lambda x :','.join(x.astype(str)),axis=1)
# Stack both frames, then count how many rows share each signature.
DF=pd.concat([dataA,dataB],axis=0)
DF['DIFF']=DF.groupby('New')['New'].transform('count')
# Keep only rows whose signature occurs exactly once.
DF=DF[DF.DIFF==1]
# The sorted result is not assigned back to DF.
DF.sort_values('Col001')


Out[1927]: 
  Col001 Col002  Col003 Col004  Col005  Col006 Col007  Col008  Col009                                                New  DIFF
3    AB4      V     221     DE    0.05      67    ABC    2345   0.023  AB4,V,221,DE,0.05,67,ABC,2345,0.023,AB4,V,221,...     1
3    AB4      V     221     DE    0.08      67    ABC    2345   0.023  AB4,V,221,DE,0.08,67,ABC,2345,0.023,AB4,V,221,...     1
4    AB5      B     342     CT    0.04      89    MNO    3457   0.023  AB5,B,342,CT,0.04,89,MNO,3457,0.023,AB5,B,342,...     1
4    AB5      B     342     NJ    0.04      89    MNO    3457   0.023  AB5,B,342,NJ,0.04,89,MNO,3457,0.023,AB5,B,342,...     1
6    AB7      F     342     PA    0.03      56    ABC    5767   0.067  AB7,F,342,PA,0.03,56,ABC,5767,0.067,AB7,F,342,...     1
6    AB7      F     342     PA    0.03      56    MNO    5767   0.067  AB7,F,342,PA,0.03,56,MNO,5767,0.067,AB7,F,342,...     1
7    AB8      C     453     CA    0.04      34    PQR    7563   0.045  AB8,C,453,CA,0.04,34,PQR,7563,0.045,AB8,C,453,...     1
7    AB8      C     453     CA    0.04      34    PQR    7563   0.048  AB8,C,453,CA,0.04,34,PQR,7563,0.048,AB8,C,453,...     1
8    AB9      B     123     CT    0.03      65    PQR    3465   0.034  AB9,B,123,CT,0.03,65,PQR,3465,0.034,AB9,B,123,...     1
8    AB9      B     123     CT    0.03      65    PQR    2353   0.034  AB9,B,123,CT,0.03,65,PQR,2353,0.034,AB9,B,123,...     1

对于您的第二个输出(基于上面得到的结果):

# Per Col001 key, collapse every column to its distinct values, ordered by
# first appearance within the group.
DF1=DF.groupby('Col001').agg(lambda x:sorted(set(x), key=list(x).index)).reset_index()
DF1=DF1.set_index('Col001')
# Positional column slice 2:10, then bring Col001 back as a regular column.
# NOTE(review): .ix is not available in current pandas releases - confirm the
# installed version or use a .loc/.iloc based selection instead.
DF2=DF1.ix[:,2:10].reset_index()
# Reshape to long form: one row per (Col001, variable) pair.
DF3=pd.melt(DF2,id_vars=['Col001'])
# Number of distinct values collected for each pair.
DF3['select']=DF3.value.apply(lambda x : len(x))
# More than one distinct value means the column differs for that key.
DF4=DF3.loc[(DF3['select']>1)]



# Expand each value list into separate columns, name the first two
# LegacyValue/NewValue, and drop the helper columns.
pd.concat([DF4.reset_index(drop=True),pd.DataFrame(DF4.value.values.tolist())],axis=1).\
    rename(columns={0:'LegacyValue',1:'NewValue'}).drop(['value','select'],axis=1)
Out[1909]: 
  Col001 variable LegacyValue NewValue
0    AB5   Col004          CT       NJ
1    AB4   Col005        0.05     0.08
2    AB7   Col007         ABC      MNO
3    AB9   Col008        3465     2353
4    AB8   Col009       0.045    0.048

更新:不要使用 .ix,请改用 .loc

# NOTE(review): presumably DF1 still has Col001 as a regular column at this
# point (i.e. this replaces the earlier set_index step) - confirm.
DF1=DF1.set_index('Col001')
cols = ['Col001','Col002','Col003','Col004','Col005','Col006','Col007','Col008','Col009',]
# Select columns by label membership in cols, then bring the Col001 index
# back as a regular column.
DF2=DF1.loc[:,DF1.columns.isin(cols)].reset_index()
# Reshape to long form: one row per (Col001, variable) pair.
DF3=pd.melt(DF2,id_vars=['Col001'])
# Number of entries in each collected value list.
DF3['select']=DF3.value.apply(lambda x : len(x))
# Keep only the pairs holding more than one value.
DF4=DF3.loc[(DF3['select']>1)]
# Expand each value list into separate columns, name the first two
# LegacyValue/NewValue, and drop the helper columns.
pd.concat([DF4.reset_index(drop=True),pd.DataFrame(DF4.value.values.tolist())],axis=1).\
    rename(columns={0:'LegacyValue',1:'NewValue'}).drop(['value','select'],axis=1)

关于python - 数据框差异,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/45822572/

相关文章:

python - Pandas DataFrame 似乎没有 "factorize"方法

python - 返回 "self"?它到底做了什么,我们什么时候需要返回 self

python - 有没有办法获得多条曲线与另一条曲线的所有交点?

python - 遍历 df 的所有值以替换零值

python - 如何将 pandas 中的 timedelta 列转换为字符串

python - 获取 Pandas 数据框中的最大连续空行

java - 如何使用 Eclipse 使用 Python 导入 Java 类?

python - 如何从训练、测试和验证文件夹中的多个文件夹中拆分和复制文件

Python - 额外的关键字(?)和继承

python - 如何按定义元素的长度将包含 NaN 的 Series 分成两部分?