#!/usr/bin/env python3 # -*- coding: utf-8 -*- ''' featherweight – A lightweight terminal news feed reader Copyright © 2013 Mattias Andrée (maandree@member.fsf.org) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ''' import xml.parsers.expat def parse_feed(feed): ''' Parse a feed file @param feed:str The raw content of the feed file @return :list The feed parsed, one dictionary per channel ''' parser = xml.parsers.expat.ParserCreate() global is_rss, feeds, root, item, text, is_atom, attrs is_rss = False is_atom = False feeds = [] root = None item = None text = None def rss_date(value): value = value.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ') while value.startswith(' '): value = value[1:] while value.endswith(' '): value = value[:-1] while ' ' in value: value = value.replace(' ', ' ') value = value.replace(':', ' ').split(' ') (_, day, month, year, hour, minute, second, offset) = value offsign, offhour, offmin = offset[0] == '+', offset[1 : 3], offset[3 : 5] year, month, day = int(year), month.lower(), int(day) hour, minute, second = int(hour), int(minute), int(second) offhour, offmin = int(offhour), int(offmin) months = ['', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] for m in range(len(months)): if month == months[m]: month = m break if offsign: hour += offhour minute += offmin else: hour -= offhour minute -= offmin while minute < 0: hour -= 1 minute += 60 if minute >= 60: hour += minute // 60 minute %= 60 while hour < 0: day -= 1 hour += 24 if hour >= 24: day += hour // 24 hour %= 24 mds = [0, 31, 30, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] while day <= 0: month -= 1 if month <= 0: month += 12 year -= 1 day += mds[month] while day > mds[month]: day -= mds[month] month += 1 if month > 12: month -= 12 year += 1 return [year, month, day, hour, minute, day] def atom_date(value): value = value.replace(' ', '').replace('\t', '').replace('\n', '').replace('\r', '') value = value.replace('+', 'T+').replace('-', 'T-').replace('Z', 'T+0000') (year, month, day) = value.split('T')[0].split('-') (hour, minute, second) = value.split('T')[1].split(':') offset = value.split('T')[2] offsign, offhour, offmin = offset[0] == '+', offset[1 : 3], offset[3 : 5] year, month, day = int(year), int(month), int(day) hour, minute, second = int(hour), int(minute), int(second) offhour, offmin = int(offhour), int(offmin) if offsign: hour += offhour minute += offmin else: hour -= offhour minute -= offmin while minute < 0: hour -= 1 minute += 60 if minute >= 60: hour += minute // 60 minute %= 60 while hour < 0: day -= 1 hour += 24 if hour >= 24: day += hour // 24 hour %= 24 mds = [0, 31, 30, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] while day <= 0: month -= 1 if month <= 0: month += 12 year -= 1 day += mds[month] while day > mds[month]: day -= mds[month] month += 1 if month > 12: month -= 12 year += 1 return [year, month, day, hour, minute, day] def start_element(name, attributes): global is_rss, feeds, root, item, text, is_atom, attrs attrs = attributes name = name.lower() if is_rss: if root is None: if name == 'channel': root = {'items' : []} else: if item is None: if name == 'item': item = {} elif is_atom: if item is None: if name == 'entry': item = {} elif name == 'rss': is_rss = True elif name == 'feed': is_atom = True root = {'items' : []} text = '' def end_element(name): global is_rss, feeds, root, item, text, is_atom, attrs name = name.lower() if (root is not None) and is_rss: if item is not None: if name == 'item': root['items'].append(item) item = None elif name in ('title', 'description', 'link', 'guid'): item[name] = text elif name == 'pubdate': item['pubdate'] = rss_date(text) else: if name in ('title', 'description', 'link'): root[name] = text elif name == 'channel': feeds.append(root) root = None elif name == 'rss': is_rss = False elif (root is not None) and is_atom: if item is not None: if name == 'entry': root['items'].append(item) item = None elif name == 'title': item['title'] = text elif name == 'id': item['guid'] = text elif name == 'summary': if 'description' not in item: item['description'] = text elif name == 'content': item['description'] = text elif name == 'link': if 'rel' not in attrs: item['link'] = text elif name == 'updated': item['pubdate'] = atom_date(text) else: if name == 'title': root['title'] = text elif name == 'subtitle': root['description'] = text elif name == 'link': if 'rel' not in attrs: root['link'] = text elif name == 'feed': feeds.append(root) root = None is_atom = False text = None def char_data(data): global text if text is not None: text += data parser.StartElementHandler = start_element parser.EndElementHandler = end_element parser.CharacterDataHandler = char_data parser.Parse(feed, True) return feeds