Scraping a page with Node.js?


I want to get the flights table for every city, but before the page shows it I have to fill in the search fields and hit the search button, and before that I have to enter a captcha on the first page. The web site is built with .NET. I enter the captcha myself and want to do the rest with Node.js, which is what I'm doing. First, I enter the captcha and, once the page has loaded, I look at Firebug, copy the request headers and the request URL, and send them with the http.request method to fetch the page again from Node.js. After getting the viewstate of that page, I again use Firebug to copy the request headers, the "post data" and the request URL, and send them with http.request to scrape the final page. All the information I copy from Firebug is fixed; I mean the URLs, the header options and the post data. I only have to change the city name in the post data. But the page I get back contains an empty table. Is this possible and, if so, what should I do? (Sorry for my bad English :) ) The URL (in Persian): http://sepehr.iranhrc.ir. Thanks in advance.

var http = require('follow-redirects').http;
var querystring = require('querystring');
var cheerio = require('cheerio');

// Form fields posted to the results page. `__viewstate` starts empty and is
// filled in from the search page before the POST is sent.
// NOTE(review): ASP.NET pages normally emit these hidden field names in upper
// case (e.g. __VIEWSTATE); the lower-case keys here are kept as found - confirm
// against the real request captured in the browser.
var datatoattach = {
    'scriptmanager1': 'uplflightsearch|btnsubmit37756070715319',
    '__asyncpost': true,
    '__eventargument': '',
    '__eventtarget': '',
    '__lastfocus': '',
    '__viewstate': '',
    '__viewstategenerator': 'e4cf65f9',
    'btnsubmit37756070715319': '?????',
    'dplfrom': 'thr',
    'dplreservationroutetype': 'roundtrip_fixeddate',
    'dplto': '0',
    'dplflightadults': '1',
    'dplflightchilds': '0',
    'dplflightinfants': '0',
    'txtcountup': '00:26',
    'txtdeparturedate': '1394/04/02',
    'txtreturningdate': '1394/04/04'
};

// Request options for the initial GET of the flight search page.
var flightssearchpageros = {
    hostname: 'sepehr.iranhrc.ir',
    path: '/systems/fa/reservation/flight_newreservation_search.aspx?qry=sbv7wbdq4b7yek1yv0opvmofqkdkbwh49wjk6uimgiw95zdjdgo0/sswjh8wjv1d',
    method: 'get',
    headers:{
        'user-agent': 'mozilla/5.0 (windows nt 6.1; wow64; rv:38.0) gecko/20100101 firefox/38.0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'accept-language': 'en-us,en;q=0.5',
        'connection': 'keep-alive',
        'cache-control': 'no-cache',
        'cookie': 'asp.net_sessionid=2iexj4pfxld4mdilfwttka2q;',
        'content-type': 'text/html; charset=utf-8',
        'host': 'sepehr.iranhrc.ir',
        'referer': 'sepehr.iranhrc.ir'
    }
};

// Request options for the POST that should return the flight results.
// NOTE(review): this uses a different hard-coded session cookie than the GET
// above - verify that both requests are meant to share one session.
var resultspageros = {
    hostname: 'sepehr.iranhrc.ir',
    path: '/systems/fa/reservation/flight_newreservation_search.aspx?action=display&rnd=2378726045210585',
    method: 'post',
    headers:{
        'user-agent': 'mozilla/5.0 (windows nt 6.1; wow64; rv:38.0) gecko/20100101 firefox/38.0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'accept-language': 'en-us,en;q=0.5',
        'cache-control': 'no-cache',
        'connection': 'keep-alive',
        'cookie': 'asp.net_sessionid=o1ipad335qahuaahc25ngalr;',
        'content-length': '',
        'content-type': 'application/x-www-form-urlencoded; charset=utf-8',
        'referer': 'http://sepehr.iranhrc.ir/systems/fa/reservation/flight_newreservation_search.aspx',
        'host': 'sepehr.iranhrc.ir',
        'pragma': 'no-cache',
        'x-microsoftajax': 'delta=true',
        'x-requested-with': 'xmlhttprequest'
    }
};

/**
 * Finds the viewstate value in a loaded search page.
 *
 * The field name is compared case-insensitively, so the lookup works whether
 * the page spells it `__VIEWSTATE` or `__viewstate`.
 *
 * @param {Function} $ - cheerio instance returned by `cheerio.load`.
 * @returns {string|undefined} the field value, or undefined when no matching
 *     <input> exists.
 */
function extractviewstate($){
    return $('input').filter(function(index, element){
        return String($(element).attr('name')).toLowerCase() === '__viewstate';
    }).first().val();
}

// Step 1: fetch the search page, copy its viewstate into the form data, then
// trigger the results POST.
var flightssearchpage = http.request(flightssearchpageros, function(response){
    // Decode as UTF-8 so multi-byte characters split across chunks stay intact.
    response.setEncoding('utf8');
    var datastream = '';
    response.on('data', function(chunk){
        datastream += chunk;
    });
    response.on('end', function(){
        var viewstate = extractviewstate(cheerio.load(datastream));
        if (!viewstate) {
            // Without a viewstate the POST cannot succeed, so stop here.
            console.log("error0: viewstate field not found in search page");
            return;
        }
        datatoattach.__viewstate = viewstate;
        resultspagerequest();
    });
});
flightssearchpage.on('error', function(e){console.log("error0: " + e.message);});
flightssearchpage.end();

/**
 * Step 2: POSTs `datatoattach` (url-encoded) to the results page and prints the
 * response status and the response body as parsed by cheerio.
 *
 * The body is serialised once so the `content-length` header and the bytes
 * actually sent always agree.
 */
function resultspagerequest(){
    var postbody = querystring.stringify(datatoattach);
    // Content-Length counts bytes, not UTF-16 code units.
    resultspageros.headers['content-length'] = Buffer.byteLength(postbody);

    var changingcitiesboxresponse = http.request(resultspageros, function(response){
        response.setEncoding('utf8');
        var datastream = '';
        response.on('data', function(chunk){
            datastream += chunk;
        });
        response.on('end', function(){
            var htmlcode = cheerio.load(datastream);
            console.log(htmlcode.html());
        });
        console.log('status: ' + response.statusCode);
    });
    changingcitiesboxresponse.on('error', function(e){console.log("error1: " + e.message);});
    changingcitiesboxresponse.end(postbody);
}

edit

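One thing I forgot to mention is that this has already been done in PHP with cURL; I looked at the cURL part of that code and did the same thing in Node with http.request. The PHP version returns the correct answer, but mine does not. I also tried PhantomJS as well. This is the code: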

// Results-page URL that PhantomJS will POST to.
var url = "http://sepehr.iranhrc.ir/systems/fa/reservation/flight_newreservation_search.aspx?action=display&rnd=4565721642440773";

// Settings handed to page.open: a POST with browser-like headers and the
// url-encoded form data.
// NOTE(review): `phantom`, `querystring` and `seconddatatoattach` are not
// defined in this snippet; they must be declared earlier in the script.
var settings = {
    operation: "post",
    encoding: "utf8",
    weak: false,
    headers: {
        'user-agent': 'mozilla/5.0 (windows nt 6.1; wow64; rv:38.0) gecko/20100101 firefox/38.0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'accept-language': 'en-us,en;q=0.5',
        'cache-control': 'no-cache',
        'connection': 'keep-alive',
        'cookie': 'asp.net_sessionid=2iexj4pfxld4mdilfwttka2q;',
        'content-length': '',
        'content-type': 'application/x-www-form-urlencoded; charset=utf-8',
        'referer': 'http://sepehr.iranhrc.ir/systems/fa/reservation/flight_newreservation_search.aspx?qry=sbv7wbdq4b7yek1yv0opvmofqkdkbwh49wjk6uimgiw95zdjdgo0/sswjh8wjv1d',
        'host': 'sepehr.iranhrc.ir',
        'pragma': 'no-cache',
        'x-microsoftajax': 'delta=true',
        'x-requested-with': 'xmlhttprequest'
    },
    data: querystring.stringify(seconddatatoattach)
};

// Open the page in PhantomJS, print the load status and the rendered body
// markup, then shut the PhantomJS process down.
phantom.create(function (ph){
  ph.createPage(function (page){
    // Pass the `url` declared above (the original referenced an undefined `url2`).
    page.open(url, settings, function(status){
        console.log(status);
        page.evaluate(function(){ return document.body.innerHTML; }, function(result){
            console.log('content ' + result);
            ph.exit();
        });
    });
  });
}, {dnodeOpts: {weak: false} });

But I didn't get an answer. And yes, it uses AJAX to send the request. According to Firebug (screenshot: "enter image description here"), the first request sends the post data; the second one, I think, is a redirect that gives a link to the result page (I used that link, with no luck); and the third is the result page with the flight details, which I can't get. I may have used them wrong. How can I use these three requests to get the flights table?

I think what's happening is that the page you want is loading its results through a separate AJAX request, and a regular request isn't going to do that. You'll need to either figure out what that separate request is and make it yourself, or scrape the page with PhantomJS so that it can execute the JavaScript on the page.

First, though, go to the page in your browser as you normally would, watch the Network tab, and locate the request that loads in the data.


Comments