ruby-on-rails - 如何使用 JSON 的 Nokogiri 中的 XPath 从 eBay 和亚马逊抓取图像

标签 ruby-on-rails ruby xpath web-scraping nokogiri

我正在尝试使用 Nokogiri 和 XPath 从网站上抓取图像,但到目前为止收效甚微。对于其 HTML 具有 imgsrc 的典型网站,我可以使用:

tmp2 = Nokogiri::HTML(open(site_url))
tmp2.xpath("//img/@src").each do |src|
  ...do whatever
end

但是,某些网站(如 Amazon 和 eBay)仅使用 JavaScript 触发特定图像。如果我查看代码,我可以看到数组中的数据。例如,来自 Amazon :

    <script type="text/javascript">
P.when('jQuery', 'cf').execute(function($, cf){
  P.load.js('http://z-ecx.images-amazon.com/images/G/01/browser-scripts/imageBlock-udp-airy/imageBlock-udp-airy-4060168860._V1_.js');
});

P.when('A', 'jQuery', 'ImageBlockATF', 'cf').register('ImageBlockBTF', function(A, $, imageBlockATF, cf){
  var data = {"indexToColor":[],"burjImageBlock":0,"isSwatchHoverConsistent":1,"heroFocalPoint":null,"visualDimensions":["color_name"],"productGroupID":"apparel_display_on_website","newVideoMissing":0,"useIV":0,"useClickZoom":null,"useChildVideos":0,"numColors":7,"logMetrics":0,"defaultColor":"initial","airyConfig":{"enableContinuousPlay":null,"installFlashButtonText":"Install Flash Player","contentTitle":null,"autoplayCutOffTimeSeconds":null,"ageGate":{"monthNames":["January","February","March","April","May","June","July","August","September","October","November","December"],"deniedPrompt":"We're sorry. You are not old enough to watch this video.","submitText":"Submit","prompt":"This video is not intended for all audiences. What date were you born?"},"videoAds":null,"videoUnsupportedPrompt":"Sorry, this video is unsupported on this browser.","desiredMode":null,"swfUrl":"http://g-ecx.images-amazon.com/images/G/01/vap/video/airy2/prod/2.0.1102.0/flash/AiryBasicRenderer._V304902271_.swf","isAutoplayEnabled":null,"installFlashPrompt":"Adobe Flash Player is required to watch this video.","isLiveStream":null,"regionCode":"NA","contentId":null,"playbackErrorPrompt":"Sorry, an error has occurred while attempting video playback. Please try again later.","contentMinAge":null,"isForesterTrackingDisabled":null,"streamingUrls":null,"parentId":null,"foresterMetadataParams":{"client":"Dpx","requestId":"1MX7VHFRVAS6TWY64BXC","marketplaceId":"ATVPDKIKX0DER","session":"182-9511970-7757812","method":"Apparel.ImageBlock"},"jsUrl":"http://z-ecx.images-amazon.com/images/G/01/vap/video/airy2/prod/2.0.1102.0/js/airy.chromeless._V304902265_.js"},"mainImageMaxSizes":null,"staticStrings":{"playVideo":"Click to play video","rollOverToZoom":"Roll over image to zoom in","images":"Images","video":"video","clickToZoom":"Click on image to zoom in","touchToZoom":"Touch the image to zoom in","videos":"Videos","close":"Close","pleaseSelect":"Please select","clickToExpand":"Click to open expanded view","allMedia":"All Media"},"notThumbnailClickImmersiveView":1,"gIsNewTwister":1,"title":"Threads 4 Thought Women's Tabitha Basic Tank Top","ivRepresentativeAsin":{"6":"B00T46V76W","4":"B00WM3O7ES","1":"B00T46YZES","3":"B00WM3NLPE","2":"B00T46VD16","5":"B00T46VGXQ"},"mainImageSizes":[[342,445],[385,500],[425,550],[466,606],[522,679]],"isQuickview":0,"ipadVideoSizes":[[340,444],[384,500]],"colorToAsin":{"Coral Dreams":{"asin":"B00T46V76W"},"Heather Grey":{"asin":"B00WM3NLPE"},"Black":{"asin":"B00T46YZES"},"White":{"asin":"B00T46VGXQ"},"Deep Blue Sea":{"asin":"B00T46VD16"},"Sea Glass":{"asin":"B00WM3O7ES"}},"thumbExperimentEnabledValue":1,"showLITBOnClick":0,"videoSizes":[[342,445],[384,500]],"stretchyGoodnessWidth":[1280,1440,1640,1800],"autoplayVideo":0,"hoverZoomIndicator":"","sitbReftag":"","useHoverZoom":1,"staticImages":{"zoomOut":"http://g-ecx.images-amazon.com/images/G/01/detail-page/cursors/zoom-out._V184888738_.bmp","hoverZoomIcon":"http://g-ecx.images-amazon.com/images/G/01/img11/apparel/UX/DP/icon_zoom._V138923886_.png","zoomIn":"http://g-ecx.images-amazon.com/images/G/01/detail-page/cursors/zoom-in._V184888790_.bmp","zoomLensBackground":"http://g-ecx.images-amazon.com/images/G/01/apparel/rcxgs/tile._V211431200_.gif","videoThumbIcon":"http://g-ecx.images-amazon.com/images/G/01/Quarterdeck/en_US/images/video._V183716339_SX38_SY50_CR,0,0,38,50_.gif","spinner":"http://g-ecx.images-amazon.com/images/G/01/ui/loadIndicators/loading-large_labeled._V192238949_.gif","zoomInCur":"http://g-ecx.images-amazon.com/images/G/01/detail-page/cursors/zoomIn._V323082799_.cur","videoSWFPath":"http://g-ecx.images-amazon.com/images/G/01/Quarterdeck/en_US/video/20110518115040892/Video._V178668404_.swf","arrow":"http://g-ecx.images-amazon.com/images/G/01/javascripts/lib/popover/images/light/sprite-vertical-popover-arrow._V186877868_.png","zoomOutCur":"http://g-ecx.images-amazon.com/images/G/01/detail-page/cursors/zoomOut._V323082798_.cur"},"videos":[],"gPreferChildVideos":0,"altsOnLeft":1,"ivImageSetKeys":{"Coral Dreams":"6","Heather Grey":"3","Black":"1","initial":0,"White":"5","Deep Blue Sea":"2","Sea Glass":"4"},"useHoverZoomIpad":"","isUDP":1,"alwaysIncludeVideo":0,"widths":[1280,1440,1640,1800],"maxAlts":7,"useChromelessVideoPlayer":1,"mainImageHeightPartitions":null};
  data["customerImages"] = eval('[]');
  data["colorImages"] = {"Coral Dreams":[{"large":"http://ecx.images-amazon.com/images/I/41FGlhksmtL.jpg","variant":"MAIN","hiRes":"http://ecx.images-amazon.com/images/I/81iXQbkcpiL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41FGlhksmtL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81iXQbkcpiL._UX466_.jpg":["466","606"],"http://ecx.images-amazon.com/images/I/81iXQbkcpiL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81iXQbkcpiL._UY550_.jpg":["423","550"],"http://ecx.images-amazon.com/images/I/81iXQbkcpiL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81iXQbkcpiL._UY500_.jpg":["385","500"]}},{"large":"http://ecx.images-amazon.com/images/I/41XR9o0cV-L.jpg","variant":"BACK","hiRes":"http://ecx.images-amazon.com/images/I/81bVmFiRu0L._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41XR9o0cV-L._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81bVmFiRu0L._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81bVmFiRu0L._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81bVmFiRu0L._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81bVmFiRu0L._UX466_.jpg":["466","606"],"http://ecx.images-amazon.com/images/I/81bVmFiRu0L._UY550_.jpg":["423","550"]}}],"Heather Grey":[{"large":"http://ecx.images-amazon.com/images/I/41f-8R8Eu-L.jpg","variant":"MAIN","hiRes":"http://ecx.images-amazon.com/images/I/81dTYkBL%2BxL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41f-8R8Eu-L._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81dTYkBL%2BxL._UX466_.jpg":["466","606"],"http://ecx.images-amazon.com/images/I/81dTYkBL%2BxL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81dTYkBL%2BxL._UY550_.jpg":["423","550"],"http://ecx.images-amazon.com/images/I/81dTYkBL%2BxL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81dTYkBL%2BxL._UX342_.jpg":["342","445"]}},{"large":"http://ecx.images-amazon.com/images/I/41gLiFBbcdL.jpg","variant":"BACK","hiRes":"http://ecx.images-amazon.com/images/I/81ua3AXCpJL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41gLiFBbcdL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81ua3AXCpJL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81ua3AXCpJL._UY550_.jpg":["423","550"],"http://ecx.images-amazon.com/images/I/81ua3AXCpJL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81ua3AXCpJL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81ua3AXCpJL._UX466_.jpg":["466","606"]}}],"Black":[{"large":"http://ecx.images-amazon.com/images/I/41BxSpfEM7L.jpg","variant":"MAIN","hiRes":"http://ecx.images-amazon.com/images/I/81%2BTW8762BL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41BxSpfEM7L._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81%2BTW8762BL._UY550_.jpg":["423","550"],"http://ecx.images-amazon.com/images/I/81%2BTW8762BL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81%2BTW8762BL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81%2BTW8762BL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81%2BTW8762BL._UX466_.jpg":["466","606"]}},{"large":"http://ecx.images-amazon.com/images/I/41Gf%2BW-cPTL.jpg","variant":"BACK","hiRes":"http://ecx.images-amazon.com/images/I/81SJwuaCspL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41Gf%2BW-cPTL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81SJwuaCspL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81SJwuaCspL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81SJwuaCspL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81SJwuaCspL._UX466_.jpg":["466","606"],"http://ecx.images-amazon.com/images/I/81SJwuaCspL._UY550_.jpg":["423","550"]}}],"White":[{"large":"http://ecx.images-amazon.com/images/I/41tElK2wPKL.jpg","variant":"MAIN","hiRes":"http://ecx.images-amazon.com/images/I/81kKgU75rIL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41tElK2wPKL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81kKgU75rIL._UY550_.jpg":["423","550"],"http://ecx.images-amazon.com/images/I/81kKgU75rIL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81kKgU75rIL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81kKgU75rIL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81kKgU75rIL._UX466_.jpg":["466","606"]}},{"large":"http://ecx.images-amazon.com/images/I/31lEDIs4cqL.jpg","variant":"BACK","hiRes":"http://ecx.images-amazon.com/images/I/81OBgvbUR7L._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/31lEDIs4cqL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81OBgvbUR7L._UX466_.jpg":["466","606"],"http://ecx.images-amazon.com/images/I/81OBgvbUR7L._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81OBgvbUR7L._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81OBgvbUR7L._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81OBgvbUR7L._UY550_.jpg":["423","550"]}}],"Deep Blue Sea":[{"large":"http://ecx.images-amazon.com/images/I/41oNq3KmSGL.jpg","variant":"MAIN","hiRes":"http://ecx.images-amazon.com/images/I/81MtZtmxVLL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41oNq3KmSGL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81MtZtmxVLL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81MtZtmxVLL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81MtZtmxVLL._UY550_.jpg":["423","550"],"http://ecx.images-amazon.com/images/I/81MtZtmxVLL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81MtZtmxVLL._UX466_.jpg":["466","606"]}},{"large":"http://ecx.images-amazon.com/images/I/41AJgd1OuYL.jpg","variant":"BACK","hiRes":"http://ecx.images-amazon.com/images/I/81uLEksrYFL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41AJgd1OuYL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81uLEksrYFL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81uLEksrYFL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81uLEksrYFL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81uLEksrYFL._UX466_.jpg":["466","606"],"http://ecx.images-amazon.com/images/I/81uLEksrYFL._UY550_.jpg":["423","550"]}}],"Sea Glass":[{"large":"http://ecx.images-amazon.com/images/I/418vg-re8oL.jpg","variant":"MAIN","hiRes":"http://ecx.images-amazon.com/images/I/81YgtD-bEwL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/418vg-re8oL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/81YgtD-bEwL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/81YgtD-bEwL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/81YgtD-bEwL._UX466_.jpg":["466","606"],"http://ecx.images-amazon.com/images/I/81YgtD-bEwL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/81YgtD-bEwL._UY550_.jpg":["423","550"]}},{"large":"http://ecx.images-amazon.com/images/I/41lcpC41VSL.jpg","variant":"BACK","hiRes":"http://ecx.images-amazon.com/images/I/814%2B6ZLwIxL._UL1500_.jpg","thumb":"http://ecx.images-amazon.com/images/I/41lcpC41VSL._SR38,50_.jpg","main":{"http://ecx.images-amazon.com/images/I/814%2B6ZLwIxL._UY500_.jpg":["385","500"],"http://ecx.images-amazon.com/images/I/814%2B6ZLwIxL._UX342_.jpg":["342","445"],"http://ecx.images-amazon.com/images/I/814%2B6ZLwIxL._UX522_.jpg":["522","679"],"http://ecx.images-amazon.com/images/I/814%2B6ZLwIxL._UX466_.jpg":["466","606"],"http://ecx.images-amazon.com/images/I/814%2B6ZLwIxL._UY550_.jpg":["423","550"]}}]};
  data["heroImage"] = {};
  data["landingAsinColor"] = 'Coral Dreams';
  data["shouldApplyResizeFix"] = false;

  return data;
});

</script>

我想抓取的文件名没有src(即http://ecx.images-amazon.com/images/I/81%2BTW8762BL._UY500_.jpg) 在这种情况下,数组称为 data["colorImages"]。但我不能硬编码任何东西,因为同样的事情发生在 eBay 上.

我在这里需要的文件名在 enImgCarousel 中。

附带说明一下,当我为每个 URL 使用以下 JavaScript 小书签来获取图像时,我能够获取正确的图像:

a='';
for (b=0;b<document.images.length;b++){
  a+='<img src='+document.images[b].src+'><br>'};
  ifa=''){
    document.writea+'</center>');
    void(document.close())
  }else{
    alert('No images!')
  }

回到 Nokogiri 和 XPath,我也尝试过:

tmp2.xpath("//img").each do |src|...

 tmp2.xpath("html//img").each do |src|

有什么想法我应该怎么做或朝哪个方向发展?

最佳答案

这是解决您想要的问题的替代方法;你可以使用 Capybara 和 Poltergeist。

我假设您不必使用此解决方案深入研究 JavaScript。

如果你刮,我建议你考虑 Capybara 和 Poltergeist,你可以找到很多引用资料。

这是我试过的代码:

require 'capybara'
require 'capybara/dsl'
require 'capybara/poltergeist'

Capybara.register_driver :poltergeist_debug do |app|
  Capybara::Poltergeist::Driver.new(app, inspector: true)
end

Capybara.javascript_driver = :poltergeist_debug
Capybara.current_driver = :poltergeist_debug 

# Amazon Case
visit_site('https://www.amazon.com/dp/B00T46V758/?tag=stackoverfl08-20')
doc_amazon = Nokogiri::HTML.parse(page.html)
doc_amazon.xpath("//img/@src").each do |src|
  p src.value  
end 

#ebay case
visit_site('https://www.ebay.com/itm/Summer-Women-Casual-Chiffon-Loose-Tops-Batwing-Short-Sleeve-Loose-T-Shirt-Blouse-/351411949784?pt=LH_DefaultDomain_0&var=&hash=item51d1c8d0d8')    
doc_ebay = Nokogiri::HTML.parse(page.html)
doc_ebay.xpath("//img/@src").each do |src|
  p src.value  
end 

如果你想深入研究它:

doc.xpath("//div[@id='imgTagWrapperId']/img").attribute('src').value
# => "https://images-na.ssl-images-amazon.com/images/I/81%2BTW8762BL._UX453_.jpg"

 doc.xpath("//div[@id='mainImgHldr']/img[@id='icImg']").attribute('src').value
# => "https://i.ebayimg.com/images/g/dtAAAOSwpdpVZuU~/s-l300.jpg"

关于ruby-on-rails - 如何使用 JSON 的 Nokogiri 中的 XPath 从 eBay 和亚马逊抓取图像,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30682191/

相关文章:

sql-server - 如何在sql中为xml的所有子节点添加属性

c# - 根据 XPath 中的属性值选择当前节点的子节点?

ruby-on-rails - 无法从 https ://rubygems. org/即使在 ubuntu 14.04 上的 gem 更新系统后获取数据

ruby-on-rails - 未定义的方法skip_confirmation! - 设计,omniauth

ruby-on-rails - Rails/Paperclip - 跳过图像处理

ruby-on-rails - Rails 要求 "end",但一切都有结束?

ruby 类未定义方法 (NoMethodError)

ruby-on-rails - Bundle 不想安装 gem(尚未 checkout )

ruby-on-rails - Rails 4,如何使用行号记录文件

eclipse - JAXB 绑定(bind) XBRL 元素不起作用