Quantcast
Channel: MobileRead Forums - Calibre
Viewing all articles
Browse latest Browse all 31487

Parsing Index

$
0
0
I'm trying to write some code to parse the index of Businessweek (http://www.businessweek.com/magazine...iness_news.htm), but it doesn't work. Can anyone help?

Code:

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        feeds = OrderedDict()
       
        article_list = soup.find('column_container clearfix')
        for section in soup.findAll(attrs={'class':lambda x: x and 'section' in x}):
            h3 = section.find('h3')
            if h3 is None:
                continue
            section_title = self.tag_to_string(h3).strip()
            if not section_title:
                continue
            self.log('Found section: %s'%section_title)
            articles = []
            subsection = ''
            for a in section.findAll('a', href=True):
                url = a['href']
                title = self.tag_to_string(h4).strip()
                if title:
                    self.log('\tFound article:', title)
                    articles.append({'title':title, 'url':url,
                    'description':'', 'date':''})

            if articles and len(articles) > 0 :
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        if not ans:
            raise Exception('Could not find any articles, either the '
                    'businessweek.com server is having trouble and you should '
                    'try later or the website format has changed and the '
                    'recipe needs to be updated.')
        return ans



    def postprocess_html(self, soup, first):
        for item in soup.findAll(attrs={'style':True}):
            del item['style']
        return soup


Viewing all articles
Browse latest Browse all 31487

Trending Articles