I'm trying to write some code to parse the index of BusinessWeek (http://www.businessweek.com/magazine...iness_news.htm), but it doesn't work. Can anyone help?
Code:
def parse_index(self):
    """Build the list of feeds from the index page at ``self.INDEX``.

    Every element whose ``class`` attribute contains the text ``section``
    is treated as one section. Its first ``<h3>`` supplies the section
    title, and every ``<a href=...>`` inside it with non-empty text
    becomes an article.

    Returns:
        A list of ``(section_title, articles)`` tuples in the order the
        sections were first seen. Each article is a dict with the keys
        ``title``, ``url``, ``description`` and ``date``.

    Raises:
        Exception: if no section yielded any article.
    """
    soup = self.index_to_soup(self.INDEX)
    feeds = OrderedDict()

    # The lambda guards against tags without a class (x is None) and
    # matches any class value that contains the text 'section'.
    for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
        h3 = section.find('h3')
        if h3 is None:
            continue
        section_title = self.tag_to_string(h3).strip()
        if not section_title:
            continue
        self.log('Found section: %s' % section_title)

        articles = []
        for a in section.findAll('a', href=True):
            url = a['href']
            # The title is the text of the link itself; the previous code
            # referenced an undefined name here and raised NameError.
            title = self.tag_to_string(a).strip()
            if title:
                self.log('\tFound article:', title)
                articles.append({'title': title, 'url': url,
                                 'description': '', 'date': ''})

        if articles:
            # Sections that share a title are merged into one feed.
            feeds.setdefault(section_title, []).extend(articles)

    # items() instead of iteritems() so this works on Python 2 and 3.
    ans = list(feeds.items())
    if not ans:
        raise Exception('Could not find any articles, either the '
                        'businessweek.com server is having trouble and you should '
                        'try later or the website format has changed and the '
                        'recipe needs to be updated.')
    return ans
def postprocess_html(self, soup, first):
    """Remove the inline ``style`` attribute from every tag that has one.

    ``soup`` is modified in place and also returned. ``first`` is
    accepted but not used by this implementation.
    """
    styled_tags = soup.findAll(attrs={'style': True})
    for tag in styled_tags:
        del tag['style']
    return soup