python - 分离 JSON 中的唯一/重复数据

但我意识到，这种做法无法告诉我每个值出现在哪个域（domain）中，而这正是我希望在最终结果里体现的信息。

解决此类问题的最佳方法是什么？

最佳答案

您可以先把字典展平，得到每个值及其对应的键路径（key path）和域（domain）。然后利用键路径和域建立频率表，再根据频率表构造出新的结构:

首先，一些将执行展平和重组的函数:

from collections import defaultdict
from itertools import product
# Sample input: a list of records, each carrying a 'domain' key.
data = [
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'silver',
        'power': {'unit': 'kW', 'value': 176},
        'doors': 5,
        'domain': 'google.com',
    },
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'silver',
        'doors': 4,
        'domain': 'facebook',
    },
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'grey',
        'power': {'unit': 'kW', 'value': 200},
        'doors': 5,
        'domain': 'facebook',
    },
]
#get all the paths and domains
def get_paths(data, c=None, d=None):
    """Flatten nested dicts/lists into ``(path, (value, domain))`` pairs.

    Args:
        data: A scalar, list, or dict to walk.
        c: Keys/indices already traversed to reach ``data``. ``None`` (the
            default) means the walk starts at the root.
        d: Domain inherited from an enclosing dict, if one was already found.

    Yields:
        ``(path, (value, domain))`` for every value that is neither a dict nor
        a list, where ``path`` is a tuple of dict keys and list indices.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    prefix = [] if c is None else list(c)
    if isinstance(data, dict):
        # An inherited truthy domain wins; otherwise this dict's own
        # 'domain' entry (if any) is used for everything beneath it.
        domain = d or data.get('domain')
        for key, value in data.items():
            yield from get_paths(value, c=prefix + [key], d=domain)
    elif isinstance(data, list):
        for index, item in enumerate(data):
            yield from get_paths(item, c=prefix + [index], d=d)
    else:
        yield (tuple(prefix), (data, d))

#compute the domain frequencies for the paths
# d1: path -> list of every value seen at that path (one item per occurrence)
# d2: (*path, value) -> {domain: number of occurrences}
d1, d2 = defaultdict(list), defaultdict(dict)
for record in data:
    for path, (value, source) in get_paths(record):
        if 'domain' in path:
            # paths that go through a 'domain' key are not counted
            continue
        per_domain = d2[(*path, value)]
        per_domain[source] = per_domain.get(source, 0) + 1
        d1[path].append(value)

#merge all the unique results
def to_dict(d):
    """Rebuild a nested dict/list structure from ``(path, value)`` pairs.

    Args:
        d: Iterable of ``(path, value)`` pairs, ``path`` being a non-empty
            sequence of dict keys and list indices.

    Returns:
        A list when every first path component is an ``int`` (this includes
        empty input), otherwise a dict keyed by the first path component.
    """
    grouped = defaultdict(list)
    for path, value in d:
        head, *rest = path
        grouped[head].append((rest, value))

    if all(isinstance(head, int) for head in grouped):
        items = []
        for entries in grouped.values():
            if all(rest for rest, _ in entries):
                # every entry continues deeper: this index holds a container
                items.append(to_dict(entries))
            else:
                items.extend(value for _, value in entries)
        return items

    rebuilt = {}
    for head, entries in grouped.items():
        first_rest, first_value = entries[0]
        # the first entry decides whether this key is a leaf or a container
        rebuilt[head] = to_dict(entries) if first_rest else first_value
    return rebuilt

#get the frequencies for the unique dict
def get_freq(d, c=None):
    """Mirror a unique-spec structure, replacing each leaf by its domain counts.

    Counts are read from the module-level ``d2`` table, keyed by
    ``(*path, value)``.

    Args:
        d: Dict or list, e.g. the structure returned by ``to_dict``.
        c: Path (keys/indices) leading to ``d``. ``None`` (the default) means
            ``d`` is the root.

    Returns:
        For a dict: a dict with the same keys. For a non-empty list of
        scalars: the counts of its first element only. For a list that holds
        containers: a list with one entry per element. For an empty list:
        ``{}``.
    """
    path = [] if c is None else list(c)
    if isinstance(d, list):
        if not d:
            # Nothing to look up; previously d[0] raised IndexError here.
            return {}
        if all(not isinstance(item, (dict, list)) for item in d):
            return d2[(*path, 0, d[0])]
        return [
            # Recurse into nested lists as well as dicts: a list cannot be
            # part of a d2 key because it is unhashable.
            get_freq(item, path + [index])
            if isinstance(item, (dict, list))
            else d2[(*path, index, item)]
            for index, item in enumerate(d)
        ]
    return {
        key: get_freq(value, path + [key])
        if isinstance(value, (dict, list))
        else d2[(*path, key, value)]
        for key, value in d.items()
    }

#build repeating results
def get_rep(d, f = False):
    """Yield one spec dict per distinct value (or value combination) in ``d``.

    Args:
        d: Iterable of ``(path, values)`` pairs. ``path`` is a non-empty
            sequence of keys/indices and ``values`` the values seen there.
        f: ``False`` for the top-level call, which yields single-key dicts
            ``{top_key: value_or_nested_dict}``. ``True`` for the recursive
            calls that build the contents of one nested block.

    Yields:
        Dicts, one per alternative. Values are de-duplicated in first-seen
        order, so the output order is reproducible between runs (iterating a
        ``set`` of strings is not).
    """
    grouped = defaultdict(list)
    for (head, *rest), values in d:
        grouped[head].append((rest, values))

    if not f:
        for key, entries in grouped.items():
            if all(not rest for rest, _ in entries):
                # plain top-level key: one result per distinct value
                for _, values in entries:
                    for value in dict.fromkeys(values):
                        yield {key: value}
            else:
                for nested in get_rep(entries, True):
                    yield {key: nested}
        return

    # Keys whose first entry ends here are leaves of this block; the rest
    # are nested blocks that need another level of recursion.
    leaves = {
        key: list(dict.fromkeys(entries[0][-1]))
        for key, entries in grouped.items()
        if not entries[0][0]
    }
    branches = [(key, entries) for key, entries in grouped.items() if key not in leaves]
    for combo in product(*leaves.values()):
        base = dict(zip(leaves, combo))
        if not branches:
            yield base
            continue
        for key, entries in branches:
            for nested in get_rep(entries, True):
                # Keep the branch under its own key; merging ``nested``
                # directly into ``base`` would lose the intermediate key.
                yield {**base, key: nested}


#find all the values in a non unique block
def get_vals(d):
    """Yield every scalar contained in ``d``, depth-first.

    Dicts contribute their values (keys are ignored), lists contribute their
    items, and anything else is yielded as-is.
    """
    if isinstance(d, dict):
        children = d.values()
    elif isinstance(d, list):
        children = d
    else:
        yield d
        return
    for child in children:
        yield from get_vals(child)
   
#get frequencies for repeated items
def get_freq_rep(d):
    """Return, for each key of a repeating spec, one domain-count dict from ``d2``.

    For every ``key -> spec`` pair, the ``d2`` entries are scanned in order
    and the first one is taken whose key tuple contains ``key`` and also
    contains at least one scalar found inside ``spec``.

    NOTE(review): matching is by tuple membership, not by exact path, and
    only the first match is used. For a nested spec this yields the counts of
    whichever leaf matches first, not of the combination as a whole.

    Raises:
        IndexError: if no ``d2`` entry matches a key.
    """
    freqs = {}
    for key, spec in d.items():
        matches = []
        for path, counts in d2.items():
            if key not in path:
                continue
            if any(val in path for val in get_vals(spec)):
                matches.append(counts)
        freqs[key] = matches[0]
    return freqs

然后，将它们放在一起:

import json
# Component sets of the paths whose values are not all identical. Two
# variants are kept because the original filters test "!= 1" and "> 1".
not_constant = [set(path) for path, vals in d1.items() if len(set(vals)) != 1]
multi_valued = [set(path) for path, vals in d1.items() if len(set(vals)) > 1]

# u: single-valued paths that share no component with any varying path
u = [
    (path, vals[0])
    for path, vals in d1.items()
    if len(set(vals)) == 1 and not any(other & set(path) for other in not_constant)
]
# u1: varying paths, plus paths sharing a component with a varying path
u1 = [
    (path, vals)
    for path, vals in d1.items()
    if len(set(vals)) > 1 or any(other & set(path) for other in multi_valued)
]

rd = to_dict(u)
result = {
    'unique': {
        'specs': rd,
        'frequencies': get_freq(rd),
    },
    'repeating': [
        {'specs': spec, 'frequencies': get_freq_rep(spec)}
        for spec in get_rep(u1)
    ],
}
print(json.dumps(result, indent=4))

输出:

{
    "unique": {
        "specs": {
            "name": "audi",
            "date": 1230768000,
            "type": "automatic",
            "fuel": [
                "Diesel"
            ]
        },
        "frequencies": {
            "name": {
                "google.com": 1,
                "facebook": 2
            },
            "date": {
                "google.com": 1,
                "facebook": 2
            },
            "type": {
                "google.com": 1,
                "facebook": 2
            },
            "fuel": {
                "google.com": 1,
                "facebook": 2
            }
        }
    },
    "repeating": [
        {
            "specs": {
                "color": "silver"
            },
            "frequencies": {
                "color": {
                    "google.com": 1,
                    "facebook": 1
                }
            }
        },
        {
            "specs": {
                "color": "grey"
            },
            "frequencies": {
                "color": {
                    "facebook": 1
                }
            }
        },
        {
            "specs": {
                "power": {
                    "unit": "kW",
                    "value": 176
                }
            },
            "frequencies": {
                "power": {
                    "google.com": 1,
                    "facebook": 1
                }
            }
        },
        {
            "specs": {
                "power": {
                    "unit": "kW",
                    "value": 200
                }
            },
            "frequencies": {
                "power": {
                    "google.com": 1,
                    "facebook": 1
                }
            }
        },
        {
            "specs": {
                "doors": 4
            },
            "frequencies": {
                "doors": {
                    "facebook": 1
                }
            }
        },
        {
            "specs": {
                "doors": 5
            },
            "frequencies": {
                "doors": {
                    "google.com": 1,
                    "facebook": 1
                }
            }
        }
    ]
}

请注意，autoplius_lt 在上面的输出中不作为频率值存在，因为它未包含在您的第二个示例字典列表中。

编辑:要删除不需要的重复结果，您可以在传递给 get_rep 之前过滤 u1 的内容:

# Drop the repeating entries whose top-level key is one of the unwanted specs.
u1 = [
    (path, values)
    for path, values in u1
    if path[0] not in ('spec_identification_manufacture_date', 'spec_powertrain_power')
]

关于python - 分离 JSON 中的唯一/重复数据，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/66813908/

python - 分离 JSON 中的唯一/重复数据

上一篇：r - Microsoft Graph API - 错误 403 "Insufficient privileges to complete the operation"

下一篇：amazon-web-services - AWS SQS触发lambda突然停止并且不删除消息