# huan/rss.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import feedparser
import time

def check(url='https://blog.fefe.de/rss.xml', db='rss_feeds.db',
          limit=12 * 3600 * 1000):
    """Fetch an RSS feed and record the entries that were not seen before.

    Each entry is identified by its title cut at the first ".".  Titles that
    are not yet listed in the database file are appended to it as a
    ``title|timestamp`` line, stamped with the current time in milliseconds.

    Args:
        url: Location of the feed; passed unchanged to ``feedparser.parse``.
        db: Path of the text database holding one ``title|timestamp`` record
            per line.  A missing file is treated as an empty database and is
            created by this call.
        limit: Age in milliseconds.  Entries whose record is older than this
            are skipped before the "already recorded?" test is applied.

    Returns:
        ``"<title> - <link>"`` for the last entry (in feed order) that was
        newly recorded during this call, or ``None`` if nothing was recorded.
    """
    current_timestamp = int(round(time.time() * 1000))

    def load_known_posts():
        """Return ``{title: oldest recorded timestamp}`` read from ``db``."""
        known_posts = {}
        try:
            with open(db, 'r') as database:
                for line in database:
                    # Split on the LAST "|" so titles that contain "|"
                    # themselves are still parsed correctly.
                    title, sep, ts_text = line.rstrip('\r\n').rpartition('|')
                    if not sep:
                        continue  # not a "title|timestamp" record
                    try:
                        ts = int(ts_text)
                    except ValueError:
                        continue  # unreadable timestamp; ignore the record
                    known_posts[title] = min(ts, known_posts.get(title, ts))
        except FileNotFoundError:
            # First run: nothing has been recorded yet.
            pass
        return known_posts

    # Read the database once; exact title lookups replace the former
    # per-post substring scan of the whole file.
    known = load_known_posts()

    #
    # get the feed data from the url
    #
    feed = feedparser.parse(url)

    #
    # figure out which posts to print: everything except posts whose
    # database record is older than the limit
    #
    posts_to_print = []
    for post in feed.entries:
        title = post.title.split(".")[0]
        recorded_at = known.get(title)
        if recorded_at is not None and current_timestamp - recorded_at > limit:
            continue
        posts_to_print.append((title, post.link))

    blogresult = None

    #
    # add all the posts we're going to print to the database with the
    # current timestamp (but only if they're not already in there)
    #
    with open(db, 'a') as database:

        def record(title, link):
            """Append an unknown title to the db; return its result text."""
            if title in known:
                return None
            database.write(title + "|" + str(current_timestamp) + "\n")
            # Remember the title so a duplicate later in the same feed is
            # not written a second time.
            known[title] = current_timestamp
            return str(title) + " - " + str(link)

        for title, link in posts_to_print:
            try:
                result = record(title, link)
            except UnicodeError:
                # The title could not be encoded for the database file.
                # NOTE(review): the fallback presumably repairs text whose
                # UTF-8 bytes were mis-decoded as cp1252 -- confirm.
                result = record(
                    title.encode("cp1252").decode("utf-8", "ignore"),
                    link.encode("cp1252").decode("utf-8", "ignore"),
                )
            if result is not None:
                blogresult = result

    return blogresult


def main():
    """Script entry point: run one feed check and discard its result."""
    check()

# Run the feed check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()