javascript - 需要一些使用 CasperJS 进行抓取的帮助

标签 javascript web-scraping phantomjs casperjs

这是我目前的代码:

var fs = require('fs');

// Output file: <working directory>/CARD_DATA/championDecks.txt
var folderName = 'CARD_DATA';
var fileName = 'championDecks.txt';
var save = fs.pathJoin(fs.workingDirectory, folderName, fileName);

// Create the Casper instance exactly once. The `clientScripts` option lists
// local scripts to inject into loaded pages; jquery.min.js is listed so that
// `$` can be used by functions passed to evaluate().
var casper = require('casper').create({
    clientScripts: ['jquery.min.js']
});

// URL of the coverage page to scrape
var parseUrl = 'http://magic.wizards.com/en/events/coverage/mtgochamp14';

// scrape
/**
 * Reads the deck headings from the current page using jQuery.
 *
 * jQuery's .text() on a multi-element set returns the combined text of all
 * matched elements, so each entry of the result is one concatenated string
 * rather than one string per deck.
 *
 * @returns {Array} two-element array: [combined h4 text, trimmed combined h5 text]
 */
function getDeckData() {
    var deckHeadings = $('.deck-meta h4');
    var eventHeadings = $('.deck-meta h5');
    var result = [];

    result.push(deckHeadings.text());
    result.push(eventHeadings.text().trim());

    return result;
}

// Load the coverage page, run getDeckData through evaluate(), and write the
// returned value (string-concatenated with a newline) to `save` using mode 'w'.
casper.start(parseUrl, function writeDeckData() {
    var scraped = this.evaluate(getDeckData);
    var line = scraped + '\n';

    fs.write(save, line, 'w');
});

// Execute the queued steps.
casper.run();

我正在尝试抓取 http://magic.wizards.com/en/events/coverage/mtgochamp14 ,并把数据整理成类似下面的格式:

{
    "event": "2014 Magic Online Championship",
    "deckName": "(Vintage) Magnus Lantto's Pyromancer Control",
    "deck": [
        "1 Dack Fayden",
        "3 Snapcaster Mage",
        "4 Young Pyromancer",
        "3 Cabal Therapy",
        "1 Demonic Tutor",
        "4 Gitaxian Probe",
        "1 Ponder",
        "3 Preordain",
        "1 Time Walk",
        "1 Treasure Cruise",
        "1 Ancestral Recall",
        "1 Brainstorm",
        "3 Dig Through Time",
        "4 Force of Will",
        "3 Gush",
        "3 Lightning Bolt",
        "4 Mental Misstep",
        "1 Pyroblast",
        "1 Black Lotus",
        "1 Mox Jet",
        "1 Mox Ruby",
        "1 Mox Sapphire",
        "3 Flooded Strand",
        "1 Island",
        "4 Scalding Tarn",
        "1 Strip Mine",
        "2 Underground Sea",
        "3 Volcanic Island"
    ],
    "sideboard": [
        "1 Pyroblast",
        "1 Dread of Night",
        "1 Electrickery",
        "4 Grafdigger's Cage",
        "4 Ingot Chewer",
        "1 Mountain",
        "1 Null Rod",
        "2 Pulverize"
    ],
    "event": "2014 Magic Online Championship",
    "deckName": "(MODERN) MAGNUS LANTTO'S ELF COMPANY",
    "deck": [ ... ],
    "sideboard": [ ... ]

    ...and so on...
}

我不知道如何把每个套牌的数据分开获取。这是我目前得到的输出:

(Vintage) Magnus Lantto's Pyromancer Control(Modern) Magnus Lantto's Elf Company(Standard) Magnus Lantto's Atarka Devotion(Vintage) Jasper de Jong's Mentor Control(Modern) Jasper de Jong's Melira and Company(Standard) Jasper de Jong's Green-White Devotion(Vintage) Aleksa Telarov's Delver(Modern) Aleksa Telarov's Burn(Standard) Aleksa Telarov's Jund Megamorph(Vintage) Antonio Del Moral León's Omni-Oath(Modern) Antonio Del Moral León's Splinter Twin(Standard) Antonio Del Moral León's Abzan Midrange,2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship                        
                      2014 Magic Online Championship

任何人都可以提供一点智慧来引导我走向正确的方向吗?

最佳答案

假设每个套牌名称(h4)都有一个对应的事件(h5),可以按索引逐一配对:

/**
 * Collects one record per deck heading on the page.
 *
 * Each `.deck-meta h4` element is paired by index with the `.deck-meta h5`
 * element at the same position, so every deck gets its own name and event
 * instead of one concatenated string.
 *
 * The result is an array of plain objects; serialize it (for example with
 * JSON.stringify) before writing it to a file, because string concatenation
 * would turn each entry into "[object Object]".
 *
 * @returns {Array<{deckName: string, event: string}>} one entry per h4 element
 */
function getDeckData() {
    var meta = $('.deck-meta h4');
    var event = $('.deck-meta h5');
    var output = [];

    // Index-based loop: the h4 and h5 sets are assumed to line up one-to-one.
    for (var i = 0; i < meta.length; i++) {
        output.push({
            deckName: $(meta[i]).text(),
            event: $(event[i]).text().trim()
        });
    }

    return output;
}

关于javascript - 需要一些使用 CasperJS 进行抓取的帮助,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/31174774/

相关文章:

python - 列表索引必须是整数,而不是元组

python - 从 scrapy.selector 导入选择器错误

visual-studio-2013 - Chutzpah 测试适配器在哪里安装 PhantomJS?

javascript - 未捕获的 TypeError : textLocation[p]. find 不是一个函数

python - 如何仅抓取特定单词

javascript - click() 的替代方法

php - Codeception多项测试,1个脚本

javascript - 如何根据给定的开始日期和期限持续时间动态设置结束日期?

Javascript:从对象数组调用函数

javascript - xml 添加属性转义 &