FT Recipe 10/27

This is a cumulative update that includes a bunch of bug fixes, aesthetic changes, etc. that I have made over the past few weeks. Hopefully it helps :)

Code:

__license__   = 'GPL v3'

__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'

'''

www.ft.com/uk-edition

'''



import datetime

from calibre.ptempfile import PersistentTemporaryFile

from calibre import strftime

from calibre.web.feeds.news import BasicNewsRecipe



class FinancialTimes(BasicNewsRecipe):
    """Calibre news recipe for the UK edition of the Financial Times.

    The recipe logs in with ``self.username`` / ``self.password``, builds its
    feed list from the section headings found on ``INDEX`` and fetches every
    article through ``self.browser`` into a temporary file.
    """

    title                 = 'Financial Times (UK)'
    __author__            = 'Darko Miletic'
    description           = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."
    publisher             = 'The Financial Times Ltd.'
    category              = 'news, finances, politics, UK, World'
    oldest_article        = 2
    language              = 'en_GB'
    max_articles_per_feed = 250
    no_stylesheets        = True
    use_embedded_content  = False
    needs_subscription    = True
    encoding              = 'utf8'
    publication_type      = 'newspaper'
    articles_are_obfuscated = True
    # Temporary files created by get_obfuscated_article(); kept referenced
    # here so they stay alive for the duration of the download.
    temp_files              = []
    masthead_url          = 'http://im.media.ft.com/m/img/masthead_main.jpg'
    LOGIN                 = 'https://registration.ft.com/registration/barrier/login'
    LOGIN2                = 'http://media.ft.com/h/subs3.html'
    INDEX                 = 'http://www.ft.com/uk-edition'
    PREFIX                = 'http://www.ft.com'

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    def get_browser(self):
        """Return a browser that has submitted the FT login form.

        The index page is opened first, then the login page, whose
        ``loginForm`` is filled with the configured credentials.
        """
        # NOTE(review): newer calibre releases expect
        # ``BasicNewsRecipe.get_browser(self)`` here -- confirm against the
        # calibre version this recipe targets before changing the call.
        br = BasicNewsRecipe.get_browser()
        br.open(self.INDEX)
        br.open(self.LOGIN)
        br.select_form(name='loginForm')
        br['username'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    keep_only_tags = [
        dict(name='div', attrs={'class': ['fullstory fullstoryHeader', 'ft-story-header']}),
        dict(name='div', attrs={'class': 'standfirst'}),
        dict(name='div', attrs={'id': 'storyContent'}),
        dict(name='div', attrs={'class': ['ft-story-body', 'index-detail']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'id': 'floating-con'}),
        dict(name=['meta', 'iframe', 'base', 'object', 'embed', 'link']),
        dict(attrs={'class': ['storyTools', 'story-package', 'screen-copy', 'story-package separator', 'expandable-image']}),
    ]

    remove_attributes = ['width', 'height', 'lang']

    # The stylesheet text is kept exactly as in the original recipe
    # (including its blank lines) because it is emitted at runtime.
    extra_css = """

                body{font-family: Georgia,Times,"Times New Roman",serif}

                h2{font-size:large}

                .ft-story-header{font-size: x-small}

                .container{font-size:x-small;}

                h3{font-size:x-small;color:#003399;}

                .copyright{font-size: x-small}

                img{margin-top: 0.8em; display: block}

                .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small}

                .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}

                """

    def get_artlinks(self, elem):
        """Collect article entries from every ``<a href>`` found under *elem*.

        Relative links are prefixed with ``PREFIX``. Each link is opened once
        so that the stored URL is the final, post-redirect address; links that
        cannot be opened are skipped. When ``self.test`` is set only the first
        two links are examined.

        Returns a list of dicts with the keys ``title``, ``date``, ``url``
        and ``description``.
        """
        articles = []
        for position, link in enumerate(elem.findAll('a', href=True), 1):
            if self.test and position > 2:
                break
            rawlink = link['href']
            # Absolute links (http or https) are used as-is; prepending
            # PREFIX to an https:// link would produce an invalid URL.
            if rawlink.startswith(('http://', 'https://')):
                url = rawlink
            else:
                url = self.PREFIX + rawlink
            try:
                # Resolve redirects without recording a visit in the browser.
                urlverified = self.browser.open_novisit(url).geturl()
            except Exception:
                # Best effort: an unreachable link is simply left out.
                continue
            articles.append({
                'title': self.tag_to_string(link),
                'date': strftime(self.timefmt),
                'url': urlverified,
                'description': '',
            })
        return articles

    def parse_index(self):
        """Build the feed list from the section headings on the index page.

        Returns a list of ``(section title, articles)`` tuples, where
        *articles* comes from :meth:`get_artlinks`. An empty list is returned
        when the expected ``div.wide`` container or its ``h3.section``
        headings are missing. When ``self.test`` is set only the first two
        sections are processed.
        """
        feeds = []
        soup = self.index_to_soup(self.INDEX)

        # The edition date shown on the page becomes the title suffix. If the
        # markup is missing, the existing timefmt is left untouched instead
        # of failing on a None lookup.
        links_box = soup.find('div', attrs={'class': 'btm-links'})
        date_tag = links_box.find('div') if links_box is not None else None
        if date_tag is not None:
            self.timefmt = ' [%s]' % self.tag_to_string(date_tag)

        wide = soup.find('div', attrs={'class': 'wide'})
        if not wide:
            return feeds
        strest = wide.findAll('h3', attrs={'class': 'section'})
        if not strest:
            return feeds

        # h4 headings first, then the h3 ones. The h3 sections are always
        # included, even when the page carries no h4 heading at all.
        sections = list(wide.findAll('h4', attrs={'class': 'section-no-arrow'}))
        sections.extend(strest)

        for position, heading in enumerate(sections, 1):
            if self.test and position > 2:
                break
            ftitle = self.tag_to_string(heading)
            self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
            link_list = heading.parent.ul
            if link_list is None:
                # No article list next to this heading: skip it rather than
                # reusing the previous section's articles.
                continue
            feeds.append((ftitle, self.get_artlinks(link_list)))
        return feeds

    def preprocess_html(self, soup):
        """Normalise article markup before conversion and return *soup*.

        * FT-specific promo/subhead elements become attribute-less ``div``s.
        * Inline ``style`` attributes are removed.
        * Links are flattened: text links are replaced by their text, links
          wrapping an image become attribute-less ``div``s.
        * Images without an ``alt`` attribute get ``alt="image"``.
        """
        custom_tags = ['promo-box', 'promo-title',
                       'promo-headline', 'promo-image',
                       'promo-intro', 'promo-link', 'subhead']
        for tag_name in custom_tags:
            for tag in soup.findAll(tag_name):
                tag.name = 'div'
                tag.attrs = []

        for tag in soup.findAll(style=True):
            del tag['style']

        for link in soup.findAll('a'):
            if link.string is not None:
                link.replaceWith(link.string)
            elif link.find('img'):
                # Keep the image but drop the hyperlink around it.
                link.name = 'div'
                link.attrs = []
            else:
                link.replaceWith(self.tag_to_string(link))

        for image in soup.findAll('img'):
            if not image.has_key('alt'):
                image['alt'] = 'image'
        return soup

    def get_cover_url(self):
        """Return the URL of the front-page PDF for today's date.

        On Sundays (ISO weekday 7) the previous day's date is used instead.
        """
        cdate = datetime.date.today()
        if cdate.isoweekday() == 7:
            cdate -= datetime.timedelta(days=1)
        return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')

    def get_obfuscated_article(self, url):
        """Download *url* via ``self.browser`` and return a temp file path.

        The download is attempted up to ten times. If every attempt fails the
        last error is re-raised, so the caller sees the real download failure
        rather than an unrelated error about a missing variable.
        """
        max_attempts = 10
        for attempt in range(max_attempts):
            try:
                html = self.browser.open(url).read()
                break
            except Exception:
                if attempt == max_attempts - 1:
                    raise
                print("Retrying download...")

        article_file = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(article_file)
        article_file.write(html)
        article_file.close()
        return article_file.name

    def cleanup(self):
        """Log out of the FT site once the download has finished."""
        self.browser.open('https://registration.ft.com/registration/login/logout?location=')

FT Recipe 10/27

Trending Articles

Bath man appears in court charged with attempted murder of a man...

MACLEAN, Allan

Black Angus Grilled Artichokes

Practice Sheet of Right form of verbs for HSC Students

Police blotter for Jan. 12

99 God Status for Whatsapp, Facebook

Rajasthan Board 12th Science Result 2018 name wise- RBSE 12th commerce result...

Notorious Naushad of Ippa gang nabbed

Child Kidnapping: Amy McNeil was kidnapped on her way to school by 5 adults;...

Sonible Smartlimit v1.1.5-R2R

NCERT Solutions for Class 9th Sanskrit Chapter 3 पाथेयम्

मतलबी दोस्त स्टेट्स | Matlabi Dost Status in Hindi – Selfish Friends Status

Arrow Flash 2 – Sinhala Dubbed – Episode 23 – 20th March 2016

[GET] AI Traffic Goldmine

[E² Plugin] HDF-Radio

Universal Multi-Patch v1.3 By RADIXX11

IWAN – Thanks and Praise ( Throw Back Thursday )

RONALD P SONDERGAARD Arrested by Miami-Dade County Corrections on Mar 03, 2017

मुख मैथुन से उठाएं सेक्स का भरपूर मज़ा, जानें क्या है इसका सही तरीकामुख मैथुन...

HSSC Excise & Taxation Inspector Result 2017 Scorecard/ Category Wise Merit List