下面是页面源代码中的一个 script 标签,我想用 Scrapy 提取其中 mp4: 列表里的字符串。我无法用 JSON 解析器加载它,也找不到其他可行的方法,而且不知道该如何为它编写 XPath。
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>RikTak Video Player - Version 1</title>
<script src="https://cdn.radiantmediatechs.com/rmp/5.2.1/js/rmp.min.js"></script>
<style>
body {
margin: 0;
}
</style>
</head>
<body>
<div id="rmpPlayer"></div>
<script>
var bitrates = {
mp4: ['https://mvd8.ddns.me:443/viewm/52/653/52653.mp4?wmsAuthSign=c2VydmVyX3RpbWU9MTAvMjMvMjAxOSA2OjI2OjAzIFBNJmhhc2hfdmFsdWU9ODlyM3FWTlRONldQWGJOT3JWQWJTUT09JnZhbGlkbWludXRlcz02MA==']
};
var schedule = {
preroll: [
'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'
],
midroll: [
[600,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'],
[1200,'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'],
[1800,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar']
],
postroll: [
'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'
]
};
var settings = {
licenseKey: 'Kl8lNHNrNzkyY3M5dj9yb201ZGFzaXMzMGRiMEElXyo=',
bitrates: bitrates,
delayToFade: 3000,
width: 750,
height: 440,
skin: 's4',
poster: 'https://images.farfeshplus.com/videos/lrg/laila_m_29.jpg',
ads: true,
adSchedule: schedule
};
var elementID = 'rmpPlayer';
var rmp = new RadiantMP(elementID);
rmp.init(settings);
</script>
</body>
</html>
请指点一些提取此数据的方法。
最佳答案
首先,您应该选择正确的选择器(selector),将 script 标签的内容提取为文本。
text = url.xpath('//body/script/text()').get()
然后你可以使用正则表达式来查找你想要的内容。
import re

# The lookbehind requires "mp4:" + one whitespace character + "['" immediately
# before the match; the group then captures the rest of that line up to the
# closing "']" ('.' does not match newlines without re.DOTALL).
mp4 = re.compile(r"(?<=mp4:\s\[')(.*)'\]")
# With a single capture group, findall() returns the list of captured strings.
matches = mp4.findall(text)
print(matches[0])
请参阅 @CypherX 的回答,了解如何使用 BeautifulSoup 得到相同的结果。
输出
https://mvd8.ddns.me:443/viewm/88/686/88686.mp4?wmsAuthSign=c2VydmVyX3RpbWU9MTAvMjMvMjAxOSAzOjMwOjE3IFBNJmhhc2hfdmFsdWU9UXgrZ1dHTWxhVGdNM0Iyd3dSeHJBdz09JnZhbGlkbWludXRlcz02MA==
数据
text = """
<script>
var bitrates = {
mp4: ['https://mvd8.ddns.me:443/viewm/88/686/88686.mp4?wmsAuthSign=c2VydmVyX3RpbWU9MTAvMjMvMjAxOSAzOjMwOjE3IFBNJmhhc2hfdmFsdWU9UXgrZ1dHTWxhVGdNM0Iyd3dSeHJBdz09JnZhbGlkbWludXRlcz02MA==']
};
var schedule = {
preroll: [
'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'
],
midroll: [
[600,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'],
[1200,'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'],
[1800,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar']
],
postroll: [
'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'
]
};
var settings = {
licenseKey: 'Kl8lNHNrNzkyY3M5dj9yb201ZGFzaXMzMGRiMEElXyo=',
bitrates: bitrates,
delayToFade: 3000,
width: 750,
height: 440,
skin: 's4',
poster: 'https://images.farfeshplus.com/videos/lrg/laila_m_29.jpg',
ads: true,
adSchedule: schedule
};
var elementID = 'rmpPlayer';
var rmp = new RadiantMP(elementID);
rmp.init(settings);
</script>
"""
关于python - 使用scrapy从脚本标签中提取数据,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58526669/