但我意识到,用这种方式无法确定每个值分别出现在哪些域中,而这正是我希望在最终结果里看到的信息。
解决此类问题的最佳方法是什么?
最佳答案
您可以先把字典展平,得到每个值及其对应的键路径和域。然后,利用键路径和域建立频率表,再根据频率表构造新的结构:
首先,定义一些用于展平和重组的函数:
from collections import defaultdict
from itertools import product
# Sample records: three listings of the same car reported by different domains.
# Key order is kept as-is because later steps iterate the dicts in insertion order.
data = [
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'silver',
        'power': {'unit': 'kW', 'value': 176},
        'doors': 5,
        'domain': 'google.com',
    },
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'silver',
        'doors': 4,
        'domain': 'facebook',
    },
    {
        'name': 'audi',
        'date': 1230768000,
        'type': 'automatic',
        'fuel': ['Diesel'],
        'color': 'grey',
        'power': {'unit': 'kW', 'value': 200},
        'doors': 5,
        'domain': 'facebook',
    },
]
# Get all the leaf paths together with the domain they belong to.
def get_paths(data, c=None, d=None):
    """Flatten a nested dict/list structure into ``(path, (value, domain))`` pairs.

    Args:
        data: A scalar, or an arbitrarily nested combination of dicts and lists.
        c: Path components (dict keys / list indices) leading to ``data``.
            Defaults to the empty path.
        d: Domain inherited from an enclosing dict, if one was already found.

    Yields:
        ``(path, (value, domain))`` for every non-container leaf. ``path`` is a
        tuple of keys/indices; ``domain`` is the first truthy ``'domain'``
        entry met on the way down, or ``None`` when there is none. The
        ``'domain'`` key itself is yielded like any other leaf.
    """
    # A None default avoids sharing one list object between calls.
    prefix = [] if c is None else list(c)
    if isinstance(data, dict):
        # Keep an already inherited domain; otherwise adopt this dict's own.
        domain = d or data.get('domain')
        for key, value in data.items():
            yield from get_paths(value, c=prefix + [key], d=domain)
    elif isinstance(data, list):
        for index, item in enumerate(data):
            yield from get_paths(item, c=prefix + [index], d=d)
    else:
        yield (tuple(prefix), (data, d))
# Compute the domain frequencies for the paths.
# d1: path -> list of every value seen at that path (one entry per occurrence).
# d2: (*path, value) -> {domain: number of occurrences}.
d1, d2 = defaultdict(list), defaultdict(dict)
for record in data:
    for path, (value, domain) in get_paths(record):
        # Skip any leaf whose path contains the 'domain' key itself.
        if 'domain' in path:
            continue
        counts = d2[(*path, value)]
        counts[domain] = counts.get(domain, 0) + 1
        d1[path].append(value)
# Merge all the unique results back into a nested structure.
def to_dict(d):
    """Rebuild a nested dict/list structure from ``(path, value)`` pairs.

    Args:
        d: Iterable of ``(path, value)`` pairs, where ``path`` is a non-empty
            sequence of dict keys and/or integer list indices.

    Returns:
        A dict keyed by the first path component, or a list when every first
        component is an integer. Returns ``{}`` for empty input.
    """
    grouped = defaultdict(list)
    for (head, *rest), value in d:
        grouped[head].append((rest, value))
    if not grouped:
        # Without this guard the all() below is vacuously true and an empty
        # input would come back as a list.
        return {}
    if all(isinstance(key, int) for key in grouped):
        merged = []
        for entries in grouped.values():
            if all(rest for rest, _ in entries):
                # Every entry continues deeper: rebuild the nested element.
                merged.append(to_dict(entries))
            else:
                merged.extend(value for _, value in entries)
        return merged
    # An empty remaining path marks a leaf; only the first entry is consulted.
    return {
        key: to_dict(entries) if entries[0][0] else entries[0][-1]
        for key, entries in grouped.items()
    }
# Get the frequencies for the unique dict.
def get_freq(d, c=None):
    """Mirror a rebuilt structure, replacing each leaf by its frequency table.

    Looks leaves up in the module-level ``d2`` mapping, keyed by
    ``(*path, value)``. ``d2`` is a ``defaultdict(dict)``, so a lookup of an
    unseen key inserts and returns an empty dict.

    Args:
        d: A dict or list of the kind produced by ``to_dict``.
        c: Path components leading to ``d``. Defaults to the empty path.

    Returns:
        For a dict, a dict with the same keys. For a list that contains only
        scalars, the frequency table of its first element alone. For any other
        list, a list with one entry per element. ``{}`` for an empty list.
    """
    # A None default avoids sharing one list object between calls.
    prefix = () if c is None else tuple(c)
    if isinstance(d, list):
        if not d:
            # Nothing to look up; d[0] below would raise IndexError.
            return {}
        if not any(isinstance(item, (dict, list)) for item in d):
            return d2[(*prefix, 0, d[0])]
        # Recurse into nested lists too: a list cannot be part of a d2 key.
        return [
            get_freq(item, [*prefix, index])
            if isinstance(item, (dict, list))
            else d2[(*prefix, index, item)]
            for index, item in enumerate(d)
        ]
    return {
        key: get_freq(value, [*prefix, key])
        if isinstance(value, (dict, list))
        else d2[(*prefix, key, value)]
        for key, value in d.items()
    }
# Build the repeating results.
def get_rep(d, f=False):
    """Yield one small spec dict per distinct value found in ``d``.

    Args:
        d: Iterable of ``(path, values)`` pairs; ``path`` is a non-empty
            sequence of keys and ``values`` lists every value seen there.
        f: ``False`` for the top-level call. ``True`` when called recursively
            on the entries that sit below a single key.

    Yields:
        Top level (``f`` false): ``{key: value}`` for each distinct value of a
        scalar key, and ``{key: nested}`` for each combination produced for a
        nested key.
        Nested (``f`` true): dicts combining one distinct value of every
        scalar sub-key (cartesian product). Sub-keys that go deeper are
        expanded recursively and attached under their own key.

    Note:
        Distinct values come from ``set(...)``, so the order in which they are
        yielded is not guaranteed.
    """
    grouped = defaultdict(list)
    for (head, *rest), values in d:
        grouped[head].append((rest, values))

    if not f:
        for head, entries in grouped.items():
            if all(not rest for rest, _ in entries):
                for _, values in entries:
                    for value in set(values):
                        yield {head: value}
            else:
                for nested in get_rep(entries, True):
                    yield {head: nested}
        return

    # Sub-keys whose (first) entry has no remaining path are scalar leaves.
    leaves = {
        head: set(entries[0][-1])
        for head, entries in grouped.items()
        if not entries[0][0]
    }
    for combo in product(*leaves.values()):
        base = dict(zip(leaves, combo))
        has_nested = False
        for head, entries in grouped.items():
            if head in leaves:
                continue
            has_nested = True
            for nested in get_rep(entries, True):
                # Attach under its own key so deeper levels are not flattened
                # into the parent.
                yield {**base, head: nested}
        if not has_nested:
            yield base
# Find all the leaf values in a non-unique block.
def get_vals(d):
    """Yield every scalar leaf of a nested dict/list structure, depth first.

    Args:
        d: A scalar, dict, or list (arbitrarily nested). Dict keys are ignored.

    Yields:
        Each non-dict, non-list value, in iteration order.
    """
    if isinstance(d, dict):
        children = d.values()
    elif isinstance(d, list):
        children = d
    else:
        yield d
        return
    for child in children:
        yield from get_vals(child)
# Get the frequencies for repeated items.
def get_freq_rep(d):
    """Attach a frequency table to every top-level key of a repeating spec.

    For each key, takes the first entry of the module-level ``d2`` mapping
    whose key tuple contains both that key and at least one leaf value of the
    spec (leaf values are collected with ``get_vals``).

    Args:
        d: A spec dict as yielded by ``get_rep``.

    Returns:
        Dict mapping each key of ``d`` to the matching frequency table.

    Raises:
        IndexError: If no entry of ``d2`` matches a key.
    """
    result = {}
    for key, spec in d.items():
        # Collect the leaves once instead of once per d2 entry.
        leaf_values = list(get_vals(spec))
        match = next(
            (
                table
                for path, table in d2.items()
                if key in path and any(value in path for value in leaf_values)
            ),
            None,
        )
        if match is None:
            raise IndexError(f'no frequency entry found for {key!r}')
        result[key] = match
    return result
然后,将它们放在一起:
import json
# Number of distinct values recorded for every path in d1.
_distinct = {path: len(set(values)) for path, values in d1.items()}


def _touches_varying(path):
    """Return True if ``path`` shares a component with a multi-valued path."""
    parts = set(path)
    return any(
        count > 1 and parts & set(other)
        for other, count in _distinct.items()
    )


# u: paths with a single distinct value that share no component with any
# multi-valued path; each is paired with that single value.
u = [
    (path, values[0])
    for path, values in d1.items()
    if _distinct[path] == 1 and not _touches_varying(path)
]
# u1: every other path, paired with the full list of values seen.
u1 = [
    (path, values)
    for path, values in d1.items()
    if _distinct[path] > 1 or _touches_varying(path)
]

rd = to_dict(u)
result = {
    'unique': {
        'specs': rd,
        'frequencies': get_freq(rd),
    },
    'repeating': [
        {'specs': spec, 'frequencies': get_freq_rep(spec)}
        for spec in get_rep(u1)
    ],
}
print(json.dumps(result, indent=4))
输出:
{
"unique": {
"specs": {
"name": "audi",
"date": 1230768000,
"type": "automatic",
"fuel": [
"Diesel"
]
},
"frequencies": {
"name": {
"google.com": 1,
"facebook": 2
},
"date": {
"google.com": 1,
"facebook": 2
},
"type": {
"google.com": 1,
"facebook": 2
},
"fuel": {
"google.com": 1,
"facebook": 2
}
}
},
"repeating": [
{
"specs": {
"color": "silver"
},
"frequencies": {
"color": {
"google.com": 1,
"facebook": 1
}
}
},
{
"specs": {
"color": "grey"
},
"frequencies": {
"color": {
"facebook": 1
}
}
},
{
"specs": {
"power": {
"unit": "kW",
"value": 176
}
},
"frequencies": {
"power": {
"google.com": 1,
"facebook": 1
}
}
},
{
"specs": {
"power": {
"unit": "kW",
"value": 200
}
},
"frequencies": {
"power": {
"google.com": 1,
"facebook": 1
}
}
},
{
"specs": {
"doors": 4
},
"frequencies": {
"doors": {
"facebook": 1
}
}
},
{
"specs": {
"doors": 5
},
"frequencies": {
"doors": {
"google.com": 1,
"facebook": 1
}
}
}
]
}
请注意,autoplius_lt 没有出现在上面输出的频率值中,因为您的第二个示例字典列表中并不包含它。
编辑:要去掉不需要的重复结果,可以在把 u1 传给 get_rep 之前先过滤其内容:
# Drop the entries whose first path component is one of the unwanted keys.
_unwanted = ('spec_identification_manufacture_date', 'spec_powertrain_power')
u1 = [(path, values) for path, values in u1 if path[0] not in _unwanted]
关于python - 分离 JSON 中的唯一/重复数据,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/66813908/