我需要解析 script 标签中的一些数据。第一个难点是页面上有好几个没有 id 或 class 的 script 标签。我需要的那个标签看起来像这样:
<script>
window.runParams = {
data: {
"priceModule":{
"maxActivityAmount":{
"currency":"USD",
"formatedAmount":"US $28.71",
"value":28.71 ***VALUE TO IGNORE***
},
"maxAmount":{
"currency":"USD",
"formatedAmount":"US $52.20",
"value":52.2 ***VALUE TO IGNORE***
},
"minActivityAmount":{
"currency":"USD",
"formatedAmount":"US $6.83",
"value":6.83 ***THIS IS THE VALUE I NEED***
},
"minAmount":{
"currency":"USD",
"formatedAmount":"US $12.42",
"value":12.42 ***THIS IS THE VALUE I NEED***
},
},
"freightItemModule":{
"commitDay":"60",
"company":"Standard Shipping",
"currency":"USD",
"discount":100,
"displayType":"deliveryTime",
"features":{
},
"freightAmount":{
"currency":"USD",
"formatedAmount":"US $0.00",
"value":0.0 ***VALUE TO IGNORE***
},
"fullMailLine":false,
"hbaService":false,
"i18nMap":{
},
"id":0,
"name":"FreightItemModule",
"notification":"",
"sendGoodsCountry":"CN",
"sendGoodsCountryFullName":"China",
"serviceName":"CAINIAO_STANDARD",
"standardFreightAmount":{
"currency":"USD",
"formatedAmount":"US $13.12",
"value":13.12 ***VALUE TO IGNORE***
},
"time":"17-25",
"tracking":true
},
"skuModule":{
"categoryId":200001392,
"features":{
},"
forcePromiseWarrantyJson":"{
}",
"hasSizeInfo":false,
"hasSkuProperty":true,
"id":0,
"name":"SKUModule",
"productSKUPropertyList":[{
"isShowTypeColor":false,
"order":1,
"showType":"none",
"showTypeColor":false,
"skuPropertyId":14,
"skuPropertyName":"????",
"skuActivityAmount":{
"currency":"USD",
"formatedAmount":"US $12.38",
"value":12.38 ***VALUE TO IGNORE***
},
"skuAmount":{
"currency":"USD",
"formatedAmount":"US $22.51",
"value":22.51 ***VALUE TO IGNORE***
},
"skuCalPrice":"22.51",
"skuMultiCurrencyCalPrice":"22.51",
"skuMultiCurrencyDisplayPrice":"22.51"
}
},
},
};
var GaData = {
pageType: "product",
productIds: "32955439786",
totalValue: "US $6.83"
};
var PAGE_TIMING = {
pageType: 'gloDetail'
};
</script>
我需要解析 [data] -> [priceModule] -> [minActivityAmount] 和 [minAmount] 中的 [value],并将它们保存在两个单独的变量中:activity_amount = 6.83 和 amount = 12.42。正如您所看到的,多个“模块”中都有多个“value”,因此用正则表达式解析它们似乎并不理想。也许有更好的方法可以从这个脚本中提取这些值?预先感谢您。
最佳答案
不幸的是,BeautifulSoup 没有提供解析 JS 内容的工具。解决这个问题的一种方法是使用正则表达式:
import re
from bs4 import BeautifulSoup
# Sample page markup used as the input for this demo: a single <script> tag
# whose body assigns window.runParams, GaData and PAGE_TIMING.
# NOTE(review): the "***...***" markers after some values look like annotations
# carried over from the question rather than real page content -- confirm
# against the live page. They are part of the string and are parsed as-is.
data = """
<script>
window.runParams = {
data: {
"priceModule":{
"maxActivityAmount":{
"currency":"USD",
"formatedAmount":"US $28.71",
"value":28.71 ***VALUE TO IGNORE***
},
"maxAmount":{
"currency":"USD",
"formatedAmount":"US $52.20",
"value":52.2 ***VALUE TO IGNORE***
},
"minActivityAmount":{
"currency":"USD",
"formatedAmount":"US $6.83",
"value":6.83 ***THIS IS THE VALUE I NEED***
},
"minAmount":{
"currency":"USD",
"formatedAmount":"US $12.42",
"value":12.42 ***THIS IS THE VALUE I NEED***
},
},
"freightItemModule":{
"commitDay":"60",
"company":"Standard Shipping",
"currency":"USD",
"discount":100,
"displayType":"deliveryTime",
"features":{
},
"freightAmount":{
"currency":"USD",
"formatedAmount":"US $0.00",
"value":0.0 ***VALUE TO IGNORE***
},
"fullMailLine":false,
"hbaService":false,
"i18nMap":{
},
"id":0,
"name":"FreightItemModule",
"notification":"",
"sendGoodsCountry":"CN",
"sendGoodsCountryFullName":"China",
"serviceName":"CAINIAO_STANDARD",
"standardFreightAmount":{
"currency":"USD",
"formatedAmount":"US $13.12",
"value":13.12 ***VALUE TO IGNORE***
},
"time":"17-25",
"tracking":true
},
"skuModule":{
"categoryId":200001392,
"features":{
},"
forcePromiseWarrantyJson":"{
}",
"hasSizeInfo":false,
"hasSkuProperty":true,
"id":0,
"name":"SKUModule",
"productSKUPropertyList":[{
"isShowTypeColor":false,
"order":1,
"showType":"none",
"showTypeColor":false,
"skuPropertyId":14,
"skuPropertyName":"????",
"skuActivityAmount":{
"currency":"USD",
"formatedAmount":"US $12.38",
"value":12.38 ***VALUE TO IGNORE***
},
"skuAmount":{
"currency":"USD",
"formatedAmount":"US $22.51",
"value":22.51 ***VALUE TO IGNORE***
},
"skuCalPrice":"22.51",
"skuMultiCurrencyCalPrice":"22.51",
"skuMultiCurrencyDisplayPrice":"22.51"
}
},
},
};
var GaData = {
pageType: "product",
productIds: "32955439786",
totalValue: "US $6.83"
};
var PAGE_TIMING = {
pageType: 'gloDetail'
};
</script>
"""
# Parse the markup with the pure-Python HTML parser.
soup = BeautifulSoup(data, features='html.parser')

# The real page has several <script> tags without id/class, so select the one
# whose body mentions window.runParams instead of taking the first tag found.
script = soup.find('script', string=re.compile(r'window\.runParams'))
if script is None:
    raise ValueError('no <script> tag containing window.runParams was found')

# script.string is the raw text of the tag (guaranteed non-None by the match above).
script_text = script.string

values = []
keys = ['minActivityAmount', 'minAmount']
for key in keys:
    # Step 1: capture the body of `"<key>":{ ... }` up to its first closing
    # brace. The key is escaped so it is always matched literally.
    block = re.search(r'"%s"\s*:\s*\{([^]]+?)\}' % re.escape(key), script_text)
    if block is None:
        raise ValueError('key %r was not found in the script' % key)

    # Step 2: pull the number that follows "value": inside that body. Only
    # digits and one decimal point are accepted, so a trailing separator comma
    # or free-text annotation is never captured.
    number = re.search(r'"value"\s*:\s*(-?[0-9]+(?:\.[0-9]+)?)', block.group(1))
    if number is None:
        raise ValueError('no numeric "value" found under key %r' % key)

    # Values are kept as strings, exactly as they appear in the page.
    values.append(number.group(1))

# For the sample markup this prints: ['6.83', '12.42']
print(values)
输出:
['6.83', '12.42']
关于django - 如何用 BeautifulSoup 解析脚本?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/56261512/