django - 如何用 BeautifulSoup 解析脚本?

标签 django python-3.x beautifulsoup

我需要解析 <script> 标签中的一些数据。第一个难点是页面上有好几个没有 id 或 class 的 <script> 标签。我需要的那个看起来像这样:

<script>
    window.runParams = {
        data: {
            "priceModule":{
                "maxActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $28.71",
                    "value":28.71 ***VALUE TO IGNORE***
                },
                "maxAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $52.20",
                    "value":52.2 ***VALUE TO IGNORE***
                },
                "minActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $6.83",
                    "value":6.83 ***THIS IS THE VALUE I NEED***
                },
                "minAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $12.42",
                    "value":12.42 ***THIS IS THE VALUE I NEED***
                },
            },
            "freightItemModule":{
                "commitDay":"60",
                "company":"Standard Shipping",
                "currency":"USD",
                "discount":100,
                "displayType":"deliveryTime",
                "features":{
                },
                "freightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $0.00",
                    "value":0.0 ***VALUE TO IGNORE***
                },
                "fullMailLine":false,
                "hbaService":false,
                "i18nMap":{
                },
                "id":0,
                "name":"FreightItemModule",
                "notification":"",
                "sendGoodsCountry":"CN",
                "sendGoodsCountryFullName":"China",
                "serviceName":"CAINIAO_STANDARD",
                "standardFreightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $13.12",
                    "value":13.12 ***VALUE TO IGNORE***
                },
                "time":"17-25",
                "tracking":true
            },
            "skuModule":{
                "categoryId":200001392,
                "features":{
                },"
                forcePromiseWarrantyJson":"{
                }",
                "hasSizeInfo":false,
                "hasSkuProperty":true,
                "id":0,
                "name":"SKUModule",
                "productSKUPropertyList":[{
                    "isShowTypeColor":false,
                    "order":1,
                    "showType":"none",
                    "showTypeColor":false,
                    "skuPropertyId":14,
                    "skuPropertyName":"????",
                    "skuActivityAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $12.38",
                        "value":12.38 ***VALUE TO IGNORE***
                    },
                    "skuAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $22.51",
                        "value":22.51 ***VALUE TO IGNORE***
                    },
                    "skuCalPrice":"22.51",
                    "skuMultiCurrencyCalPrice":"22.51",
                    "skuMultiCurrencyDisplayPrice":"22.51"
                }
            },
        },
    };

    var GaData = {
        pageType: "product",
        productIds: "32955439786",
        totalValue: "US $6.83"
    };

    var PAGE_TIMING = {
        pageType: 'gloDetail'
    };
</script>

我需要解析 [data] -> [priceModule] -> [minActivityAmount] 和 [minAmount] 中的 [value],并将它们分别保存到两个单独的变量中:activity_amount = 6.83 和 amount = 12.42。正如您所看到的,多个“模块”中都有多个“value”,因此直接用正则表达式解析它们似乎并不理想。也许有更好的方法可以从这个脚本中提取这些值?预先感谢您。

最佳答案

不幸的是,BeautifulSoup 只能定位 <script> 标签,并没有提供解析其中 JS 内容的工具。

解决这个问题的方法是使用正则表达式:

import re

from bs4 import BeautifulSoup

data = """
<script>
    window.runParams = {
        data: {
            "priceModule":{
                "maxActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $28.71",
                    "value":28.71 ***VALUE TO IGNORE***
                },
                "maxAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $52.20",
                    "value":52.2 ***VALUE TO IGNORE***
                },
                "minActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $6.83",
                    "value":6.83 ***THIS IS THE VALUE I NEED***
                },
                "minAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $12.42",
                    "value":12.42 ***THIS IS THE VALUE I NEED***
                },
            },
            "freightItemModule":{
                "commitDay":"60",
                "company":"Standard Shipping",
                "currency":"USD",
                "discount":100,
                "displayType":"deliveryTime",
                "features":{
                },
                "freightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $0.00",
                    "value":0.0 ***VALUE TO IGNORE***
                },
                "fullMailLine":false,
                "hbaService":false,
                "i18nMap":{
                },
                "id":0,
                "name":"FreightItemModule",
                "notification":"",
                "sendGoodsCountry":"CN",
                "sendGoodsCountryFullName":"China",
                "serviceName":"CAINIAO_STANDARD",
                "standardFreightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $13.12",
                    "value":13.12 ***VALUE TO IGNORE***
                },
                "time":"17-25",
                "tracking":true
            },
            "skuModule":{
                "categoryId":200001392,
                "features":{
                },"
                forcePromiseWarrantyJson":"{
                }",
                "hasSizeInfo":false,
                "hasSkuProperty":true,
                "id":0,
                "name":"SKUModule",
                "productSKUPropertyList":[{
                    "isShowTypeColor":false,
                    "order":1,
                    "showType":"none",
                    "showTypeColor":false,
                    "skuPropertyId":14,
                    "skuPropertyName":"????",
                    "skuActivityAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $12.38",
                        "value":12.38 ***VALUE TO IGNORE***
                    },
                    "skuAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $22.51",
                        "value":22.51 ***VALUE TO IGNORE***
                    },
                    "skuCalPrice":"22.51",
                    "skuMultiCurrencyCalPrice":"22.51",
                    "skuMultiCurrencyDisplayPrice":"22.51"
                }
            },
        },
    };

    var GaData = {
        pageType: "product",
        productIds: "32955439786",
        totalValue: "US $6.83"
    };

    var PAGE_TIMING = {
        pageType: 'gloDetail'
    };
</script>
"""

# Parse the page and select the <script> that defines window.runParams.
# A real page can contain several <script> tags without id/class, and
# soup.find('script') would only ever return the first of them.
soup = BeautifulSoup(data, features='html.parser')
script = next(
    (tag for tag in soup.find_all('script')
     if 'window.runParams' in (tag.string or '')),
    None,
)
if script is None:
    raise ValueError('no <script> tag containing window.runParams was found')

# Captures the number that follows "value": inside the flat {...} object that
# belongs to the given key.
#   * [^{}]*? keeps the match inside that single object, so "value" entries
#     of other modules can never be picked up.
#   * \s* tolerates optional whitespace around ':'.
#   * The comma is deliberately not part of the number; otherwise
#     '"value":6.83,' would be captured as '6.83,'.
VALUE_PATTERN = r'"%s"\s*:\s*\{[^{}]*?"value"\s*:\s*(-?\d+(?:\.\d+)?)'

values = []
keys = ['minActivityAmount', 'minAmount']
for key in keys:
    # re.escape guards against keys that contain regex metacharacters.
    found = re.search(VALUE_PATTERN % re.escape(key), script.string)
    if found is None:
        # Fail with a message naming the key instead of an opaque
        # AttributeError on NoneType.
        raise ValueError('no "value" found for key %r' % key)
    values.append(found.group(1))

print(values)

输出:

['6.83', '12.42']

关于“django - 如何用 BeautifulSoup 解析脚本?”,我们在 Stack Overflow 上找到一个类似的问题:https://stackoverflow.com/questions/56261512/

相关文章:

python - 项目中不同应用程序的 Django 文件上传?

django - django中的软删除对象

python - Chrome 从终端成功打开,但我收到 webdriver 常见异常消息

python - 你能在 python 中循环创建类吗?

python - Beautiful Soup 4 CSS 选择器的工作方式与教程显示的不同

python - 我可以在一个月中的某一天进行 order_by 吗?

python - 多对多关系 禁止直接分配到多对多集合的前向端

python - pygame 中的类出现问题

python - BS4 : Getting text in tag

python - 从 TD 中提取类别值 -