javascript - 获取脚本标签内的内容

标签 javascript scrapy web-crawler scrapy-splash splash-js-render

大家好,我正在尝试获取脚本标签内的内容。

http://www.teknosa.com/urunler/145051447/samsung-hm1500-bluetooth-kulaklik

这是网站。

这是我想要获取其内容的脚本标签。

$.Teknosa.ProductDetail = {"ProductComputedIndex":145051447,"ProductName":"SAMSUNG HM1500 BLUETOOTH KULAKLIK","ProductSeoName":"samsung-hm1500-bluetooth-kulaklik","ProductBarcode":"8808993790425","ProductPriceInclTax":79.9,"ProductDiscountedPriceInclTax":null,"ProductStockQuantity":1,"ProductMinStockQuantity":null,"ProductShortDescription":null,"ProductFullDescription":null,"ProductModelName":"HM1500","ProductAdminComment":null,"ProductMetaTitle":null,"ProductMetaKeywords":null,"ProductMetaDescription":null,"ProductBrandId":299,"ProductBrandName":"SAMSUNG","ProductBrandImageName":"//img-teknosa.mncdn.com/StaticContent/images/Brand/SAMSUNG-medium.png","ProductCommentCout":29,"ProductQuestionAnswerCout":0,"ProductRatingStar":4,"ProductType":1,"ProductOriginalComputedIndex":null,"ProductIsSolo":false,"ProductIsClickCollect":true,"ProductStoreStockAmount":1,"ProductGroupDisplayName":null,"ProductOrigin":"PRC","ProductIsTss":false,"ProductIsKit":false,"AddBasketButtonType":0,"ProductViewType":0,"ProductDetailDefaultPicture":"145051447-1-samsung-hm1500-bluetooth-kulaklik.jpg","ProductRatingStarText":"Çok İyi","ProductPrice":"79,9","IsThereOutletProduct":false,"ProductIsActiveProductOriginal":false,"ProductErpCatalogCode":"_TELEKOM","ProductErpCategoryCode":"_BLUETOOTH_KULAKLIKLAR1636","ProductCategory":{"CategoryName":"Bluetooth Kulaklık ve Kit","CategorySeoName":"bluetooth-kulaklik-ve-kit","CategoryDescription":null,"CategoryParentId":134,"CategoryLevel":2,"CategoryMetaTitle":null,"CategoryMetaKeywords":null,"CategoryMetaDescription":null,"Parent":{"CategoryName":"Telefon 
Aksesuarları","CategorySeoName":"telefon-aksesuarlari","CategoryDescription":null,"CategoryParentId":108,"CategoryLevel":1,"CategoryMetaTitle":null,"CategoryMetaKeywords":null,"CategoryMetaDescription":null,"Parent":{"CategoryName":"Telefon","CategorySeoName":"telefon","CategoryDescription":null,"CategoryParentId":null,"CategoryLevel":0,"CategoryMetaTitle":null,"CategoryMetaKeywords":null,"CategoryMetaDescription":null,"Parent":null,"DisplayOrder":6,"StatusId":100110,"StartDate":"\/Date(1434351061000)\/","EndDate":null,"Id":108},"DisplayOrder":3,"StatusId":100110,"StartDate":"\/Date(1434351245000)\/","EndDate":null,"Id":134},"DisplayOrder":3,"StatusId":100110,"StartDate":"\/Date(1434351367000)\/","EndDate":null,"Id":173},"ProductDetailPictures":[{"ProductPictureName":"145051447-1-samsung-hm1500-bluetooth-kulaklik.jpg","ProductPictureOrder":1,"ProductPictureIsDefault":true},{"ProductPictureName":"145051447-2-samsung-hm1500-bluetooth-kulaklik.jpg","ProductPictureOrder":2,"ProductPictureIsDefault":false}],"ProductDetailAttributes":[{"Key":"Ağırlık","Value":"18.1","UnitItemName":"gr","ProductAttributeDisplayOrder":0,"DisplayOrder":2,"Description":null},{"Key":"Model","Value":"HM1500","UnitItemName":null,"ProductAttributeDisplayOrder":0,"DisplayOrder":4,"Description":null},{"Key":"Şarj Kullanım Süresi","Value":"2 Saat","UnitItemName":null,"ProductAttributeDisplayOrder":0,"DisplayOrder":80,"Description":null},{"Key":"Bekleme Süresi (Saat)","Value":"250 Saat (Maks.)","UnitItemName":null,"ProductAttributeDisplayOrder":0,"DisplayOrder":116,"Description":null},{"Key":"Kullanım Mesafesi","Value":"10 m. 
(Maks.)","UnitItemName":null,"ProductAttributeDisplayOrder":0,"DisplayOrder":145,"Description":null},{"Key":"Bluetooth Profili","Value":"HSP (Kulaklık), HFP (Ahizesiz)","UnitItemName":null,"ProductAttributeDisplayOrder":0,"DisplayOrder":149,"Description":null}],"ProductSuggestions":[],"ProductContents":[],"ProductKitItems":[],"ProductVideos":[],"ProductGroups":[],"ProductBadges":[{"BadgeItemBadgeId":7,"BadgeItemApplicationId":1,"BadgeItemText":null,"BadgeItemImageName":"//img-teknosa.mncdn.com/StaticContent/images/Badge/ucretsiz-kargo.png","BadgeItemDescription":null,"BadgeItemPagePosition":"ImageBottom","BadgeItemImagePosition":null,"BadgeItemDisplayView":"ProductDetail","BadgeItemType":"Image","BadgeItemDynamicType":"WebStock","BadgeItemDynamicTypeText1":null,"BadgeItemDynamicTypeText2":null,"BadgeItemDynamicTypeCalculationType":null,"BadgeItemDynamicTypeDisplayType":null,"BadgeItemEvaluationExpression":null,"BadgeItemClassName":null,"DisplayOrder":0,"StatusId":100110,"StartDate":"\/Date(1474440397000)\/","EndDate":null,"Id":5}],"DisplayOrder":1000,"StatusId":100110,"StartDate":"\/Date(1429000863000)\/","EndDate":null,"Id":4715};

我尝试了下面的代码。

yield scrapy.Request(response.urljoin(url), callback = self.parseProduct, meta={
                                'splash': {
                                 'endpoint': 'render.html',
                                 'args': {'wait': 0.09}},
                                'url': url
                            })
 def parseProduct(self, response):
     """Parse the product page and collect keys from its inline script.

     The inline ``<script>`` text is located with a positional XPath,
     parsed with ``js2xml``, and every matching assignment contributes
     one (empty) entry to ``data_bundles``.

     Args:
         response: The response passed to this callback; it is used
             only through ``response.xpath(...)``.

     Returns:
         None. ``data_bundles`` is populated but not returned in the
         visible snippet.
     """
     data_bundles = {}
     # extract_first() yields None when the positional XPath matches nothing.
     script = response.xpath('/html/body/div[1]/div[2]/script[2]/text()').extract_first()
     print(script)
     if not script:
         # Nothing to parse; avoid handing None/"" to js2xml.parse().
         return
     jstree = js2xml.parse(script)
     for a in jstree.xpath('//assign[left//property/identifier/@name="$.Teknosa.ProductDetail" and right/object]'):
         bundle_prop = a.xpath('./left/bracketaccessor/property/string/text()')
         print(bundle_prop)
         # An XPath node-set query returns a list (never None), so test for
         # emptiness instead; indexing an empty result would raise IndexError.
         if not bundle_prop:
             continue
         data_bundles[bundle_prop[0]] = {}

感谢您的帮助。

最佳答案

应该这样做:

# Select the <script> elements whose text matches "Teknosa.ProductDetail ="
# (EXSLT re:test with the 'i' flag, i.e. case-insensitive) and extract the
# matching elements as strings.
response.xpath("//script[re:test(text(),'Teknosa.ProductDetail =','i')]").extract()

这样可以选中文本中包含 "Teknosa.ProductDetail =" 的 script 标签。

编辑:如果你想把脚本中的 JavaScript 对象加载为 Python 字典,需要先从脚本中提取出对应的文本,然后用 Python 的 json 模块加载即可。

import json

# Select only the text of the <script> that assigns Teknosa.ProductDetail
# (case-insensitive EXSLT regex test).
xp = "//script[re:test(text(),'Teknosa.ProductDetail =','i')]/text()"
# Raw string: "\{" is an invalid escape sequence in a plain string literal.
# The greedy group captures from the first "{" after " = " up to the last "}"
# on the same line; [0] raises IndexError if the pattern does not match.
raw_json = response.xpath(xp).re(r" = (\{.+\})")[0]
# The captured object literal is valid JSON, so json.loads can parse it.
data = json.loads(raw_json)
print(data['ProductBarcode'])
> '8808993790425'

关于javascript - 获取脚本标签内的内容,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41333902/

相关文章:

seo - 如何允许爬虫只访问 index.php,使用 robots.txt?

javascript - 文件未定义 Node js

python - python的 Mechanize 可以提取与控件关联的文本吗?

javascript - Ember 原始 JSON 转换无法正常工作?

python - Scrapy 表示没有抓取任何页面/项目?

python - 将多个 *.ts 文件(字节或类字节格式)连接到一个 mp4 文件

python - 停止 Scrapy 抓取相同的 URL

python - Scrapy 没有按照 allowed_domains 过滤结果

javascript - 如何使用在点击函数内声明的变量

javascript - 使用 jQuery Mobile 捕获 'pageshow' 上的经纬度坐标