I'm learning how to programme, and I want to scrape a webpage minus the JavaScript code. I'm following an example from a book. The code below should return the HTML code of the website, but it only returns the title of the site and the JavaScript code at the bottom. Can someone please let me know what went wrong? Cheers.
import urllib2
from bs4 import BeautifulSoup
url = "http://www.theurl.com/"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
[x.extract() for x in soup.find_all('script')]
print soup.get_text()
This returns the following after the title:
var _gaq = _gaq || []; _gaq.push(['_setAccount', 'UA-11092338-1']); _gaq.push(['_trackPageview']); (function() { var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); })();
Have you tried printing soup.contents? When you print soup.get_text(), it prints only the text of the page, not the HTML markup. Please try the following code.
import urllib2
from bs4 import BeautifulSoup
url = "http://www.theurl.com/"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
[x.extract() for x in soup.find_all('script')]
html = soup.contents
for i in html:
    print i
Comments
Post a Comment