Python Web Scraping
We'll be reviewing this code and talking about Unicode and encoding issues in Python, as well as open source databases.
import urllib2
import codecs
import BeautifulSoup
import re
import pdb
import os
class GetBBC:
    """Scrape the BBC World Service language index and archive each page.

    Typical use is three steps: ``getLanguageChoices()`` collects the
    language links from the index page, ``archiveLanguagePages()`` downloads
    every linked page into ``self.dir``, and ``readLanguagePage(name)`` reads
    one archived page back.

    Attributes:
        language_links: tags collected by ``getLanguageChoices``; each is
            expected to contain an ``<a>`` child with ``href`` and text.
        soup: parsed index page, or ``None`` until ``getLanguageChoices``
            has run.
        dir: directory (relative to the working directory) that holds the
            archived pages.
    """

    # Codec used both to write and to read the archived pages.
    ENCODING = 'ISO8859-1'

    def __init__(self):
        """Create the archive directory and start with no collected links."""
        print("In constructor")
        self.language_links = []
        self.soup = None
        self.dir = 'BBC_Language_pages'
        try:
            os.makedirs(self.dir)
        except OSError:
            # An already-existing directory is the expected case on re-runs;
            # any other failure (permissions, a plain file in the way) must
            # not be hidden.
            if not os.path.isdir(self.dir):
                raise

    def _fetch(self, url):
        """Return the raw response body for *url*, always closing the handle."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def getLanguageChoices(self):
        """Download the language index and append its language tags.

        Every tag whose ``class`` attribute starts with ``langtext`` (which
        also covers ``langtexttop``) is appended to ``self.language_links``.
        Calling this more than once appends the links again.
        """
        lang_page = self._fetch("http://www.bbc.co.uk/worldservice/languages/")
        self.soup = BeautifulSoup.BeautifulSoup(lang_page)
        # A plain prefix match: the former trailing '*' only made the final
        # 't' optional, it did not act as a wildcard.
        links = self.soup.findAll(attrs={'class': re.compile(r'^langtext')})
        for x in links:
            self.language_links.append(x)
            print("Appending %s with link %s " % (x.a.string, x.a['href']))
        print("There are %d language choices for the BBC news page!"
              % len(self.language_links))

    def archiveLanguagePages(self):
        """Download every collected language page into ``self.dir``.

        Each page is prettified and saved in a file named after the link
        text. Paths are built with ``os.path.join`` rather than by changing
        the working directory, so a failed download or write cannot leave
        the process stranded inside the archive directory.
        """
        for x in self.language_links:
            lang_page = self._fetch('http://www.bbc.co.uk' + x.a['href'])
            clean_page = BeautifulSoup.BeautifulSoup(lang_page).prettify()
            path = os.path.join(self.dir, x.a.string)
            rawfile = codecs.open(path, 'wb+', self.ENCODING)
            try:
                # NOTE(review): this treats prettify()'s output as ISO8859-1
                # bytes. If prettify() actually returns UTF-8 encoded bytes,
                # the bytes on disk survive unchanged but text decoded with
                # ENCODING will be mojibake for non-Latin-1 pages -- confirm
                # against the BeautifulSoup version in use.
                rawfile.write(unicode(clean_page, self.ENCODING))
            finally:
                rawfile.close()
            print("Saved the %s page." % x.a.string)

    def readLanguagePage(self, language):
        """Return the archived page for *language* as decoded text.

        Args:
            language: file name inside ``self.dir``, i.e. the link text used
                when the page was archived (for example ``'Portuguese'``).

        Returns:
            The file contents decoded with ``ENCODING``.

        Raises:
            IOError: if no archived page with that name exists.
        """
        path = os.path.join(self.dir, language)
        rawfile = codecs.open(path, 'rb', self.ENCODING)
        try:
            # Return the text itself; the handle is closed on the way out and
            # would be useless to the caller.
            return rawfile.read()
        finally:
            rawfile.close()
if __name__ == "__main__":
    # Script entry point: collect the language links from the BBC World
    # Service index, archive every linked page to disk, then read one
    # archived page back as a smoke test.
    x = GetBBC()
    x.getLanguageChoices()
    x.archiveLanguagePages()
    # 'Portuguese' must match the link text used as the file name when the
    # page was archived.
    y = x.readLanguagePage('Portuguese')