Python Web Scraping

We'll be reviewing this code and talking about Unicode and encoding issues in Python, as well as open source databases.

import urllib2
import codecs
import BeautifulSoup
import re
import pdb
import os

class GetBBC(object):
    """Scrape the BBC World Service language index and archive each page.

    Written for Python 2 (``urllib2``) and BeautifulSoup 3. Typical use is
    ``getLanguageChoices()`` followed by ``archiveLanguagePages()``; archived
    pages can then be read back with ``readLanguagePage(name)``.

    Attributes:
        language_links: BeautifulSoup elements collected by
            ``getLanguageChoices``; each is expected to contain an ``<a>``
            whose text is the language name and whose ``href`` is the page
            path.
        dir: Directory (relative to the working directory at construction
            time) in which pages are archived.
    """

    # Single encoding shared by the writer and the reader so the text
    # round-trips. BeautifulSoup 3 documents prettify() as returning UTF-8
    # encoded bytes by default; decoding those bytes as ISO8859-1 yields
    # mojibake for every non-ASCII character.
    ENCODING = 'utf-8'

    def __init__(self):
        """Create the archive directory if it does not exist yet.

        Raises:
            OSError: if the directory cannot be created for any reason other
                than it already existing.
        """
        print("In constructor")
        self.language_links = []
        self.dir = 'BBC_Language_pages'
        try:
            os.makedirs(self.dir)
        except OSError:
            # An already-existing directory is fine; anything else
            # (permissions, a plain file in the way, ...) is a real error.
            if not os.path.isdir(self.dir):
                raise

    def _fetch(self, url):
        """Return the raw body of *url*, always closing the connection."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def getLanguageChoices(self):
        """Download the language index and collect the per-language links.

        Stores the parsed index in ``self.soup`` and appends every element
        whose ``class`` attribute starts with ``langtext`` to
        ``self.language_links``.
        """
        lang_page = self._fetch("http://www.bbc.co.uk/worldservice/languages/")
        self.soup = BeautifulSoup.BeautifulSoup(lang_page)
        # Prefix match so that "langtexttop" is picked up as well. (A trailing
        # "*" here would be a regex quantifier on the final "t", not a glob.)
        links = self.soup.findAll(attrs={'class': re.compile(r'^langtext')})
        for link in links:
            self.language_links.append(link)
            print("Appending %s with link %s " % (link.a.string, link.a['href']))

        print("There are %d language choices for the BBC news page!"
              % len(self.language_links))

    def archiveLanguagePages(self):
        """Fetch every collected language page and save it under ``self.dir``.

        Each page is prettified with BeautifulSoup and written to a file
        named after the link text, encoded with ``ENCODING``. The process
        working directory is left untouched.
        """
        for link in self.language_links:
            language = link.a.string
            raw_page = self._fetch('http://www.bbc.co.uk' + link.a['href'])
            clean_page = BeautifulSoup.BeautifulSoup(raw_page).prettify()
            # prettify() hands back encoded bytes by default; turn them into
            # text before giving them to the encoding writer.
            if isinstance(clean_page, bytes):
                clean_page = clean_page.decode(self.ENCODING)
            target = os.path.join(self.dir, language)
            with codecs.open(target, 'wb', self.ENCODING) as archive:
                archive.write(clean_page)
            print("Saved the %s page." % language)

    def readLanguagePage(self, language):
        """Return the archived page for *language* as decoded text.

        Args:
            language: File name used when the page was archived (the link
                text, e.g. ``'Portuguese'``).

        Returns:
            The full contents of the archived file, decoded with
            ``ENCODING``.

        Raises:
            IOError: if no archived file exists for *language*.
        """
        source = os.path.join(self.dir, language)
        with codecs.open(source, 'rb', self.ENCODING) as archive:
            return archive.read()

if __name__ == "__main__":
    # Collect the language links, archive every language page, then read
    # the Portuguese archive back from disk.
    scraper = GetBBC()
    scraper.getLanguageChoices()
    scraper.archiveLanguagePages()
    portuguese_page = scraper.readLanguagePage('Portuguese')