I'm using Selenium's Node.js API to run PhantomJS instances against a series of web pages. The code I use to execute actions on the pages works fine, but it seems that only one instance of Selenium/PhantomJS can run at a time. This function is called multiple times from the same module, and it steps through the pages of a webshop where pagination is handled client side (which is why I need the Selenium/PhantomJS environment - to extract the data from each page).
Once again, the code in and of itself works fine, but it can't execute in parallel. What could be causing this?
module.exports = function (crawler, page, parsepage, done) { "use strict"; var _ = require("lodash"), format = require("util").format, path = require("path"), webdriver = require("selenium-webdriver"), = webdriver.by, until = webdriver.until; var phantompath = path.resolve(__dirname, "../node_modules/.bin/phantomjs"), iswin = process.platform === "win32"; var driver = new webdriver.builder() .withcapabilities({ "phantomjs.binary.path": iswin ? phantompath + ".cmd" : phantompath }) .forbrowser("phantomjs") .build(); var windowhandle = new webdriver.webdriver.window(driver); windowhandle.setsize(1100, 1000); var getallpagescontent = function (driver) { var pagescontent = [], pageno = 1; var getnextpage = function () { var nextpagelink; return driver.findelements(by.css(".pagination li")).then(function (elements) { return elements[elements.length - 1]; }).then(function (element) { nextpagelink = element; return element.getattribute("class"); }).then(function (classname) { return _.includes(classname, "active"); }).then(function (islastpage) { return (!islastpage) ? driver.getpagesource() : false; }).then(function (content) { if (content) pagescontent.push(content); content && console.log("got page %d", pageno++); return nextpagelink.findelement(by.css("a")).then(function (element) { return element.click(); }).then(function () { return driver.wait(until.stalenessof(nextpagelink), 10 * 1000); }).then(function () { return content ? 
getnextpage() : pagescontent; }); }); }; return getnextpage(); }; var processtimeout = settimeout(function () { console.log("phantomjs page %s took long execute", page.url); driver.quit().then(done); }, 60 * 1000); driver.get(page.url).then(function () { var pageoverlay = driver.findelement(by.css("#overlay-the-new")); return pageoverlay.isdisplayed().then(function (visible) { if (visible) { pageoverlay.click(); return driver.wait(until.elementisnotvisible(pageoverlay), 10000); } }).then(function () { return getallpagescontent(driver); }); }).then(function (contents) { cleartimeout(processtimeout); console.log("got %d pages %s", contents.length, page.url); _.foreach(contents, function (pagecontent) { parsepage(page.url, pagecontent); }); return driver.quit(); }).then(function () { done(); }); }
Although PhantomJS is deprecated, it can still be run in parallel in isolated Docker containers using Selenoid. There is a ready-to-use image with the latest release here: https://hub.docker.com/r/selenoid/phantomjs/tags/
Comments
Post a Comment