#!/usr/bin/env python3
import sys, getopt

import requests
from diskcache import Index
from selectolax.parser import HTMLParser

DEBUG = False
USAGE = f"Usage: python {sys.argv[0]} [--help] | [--version] | [--reddit] | [--lobsters]"
VERSION = f"{sys.argv[0]} version 1.0.0"

subs = {}
tags = {}
limit = 100
timeframe = 'week'  # hour, day, week, month, year, all
listing = 'top'     # controversial, best, hot, new, random, rising, top
badurls = []
posts = Index('data/results')   # persistent on-disk store of already-seen posts
newposts = {}
reddit = False
lobsters = False


def parse():
    """Parse command-line options and return (separator, operands, config)."""
    config = {}
    options, arguments = getopt.getopt(
        sys.argv[1:],                                # Arguments
        'vrlh',                                      # Short option definitions
        ["version", "reddit", "lobsters", "help"])   # Long option definitions
    separator = "\n"
    for o, a in options:
        if o in ("-v", "--version"):
            print(VERSION)
            sys.exit()
        if o in ("-r", "--reddit"):
            config['reddit'] = True
        if o in ("-l", "--lobsters"):
            config['lobsters'] = True
        if o in ("-h", "--help"):
            print(USAGE)
            sys.exit()
    if not options:
        raise SystemExit(USAGE)
    try:
        operands = [int(arg) for arg in arguments]
    except ValueError:
        raise SystemExit(USAGE)
    return separator, operands, config


def get_reddit(subreddit, listing, limit, timeframe):
    """Fetch a subreddit listing as JSON; Reddit requires a custom User-agent."""
    base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}'
    try:
        request = requests.get(base_url, headers={'User-agent': 'yourbot'})
    except requests.RequestException as e:
        raise SystemExit(f'An error occurred fetching {base_url}: {e}')
    return request.json()


def process_reddit(sub, data, min_score):
    """Filter posts by score and badurls, then merge new posts into the cache."""
    for i in data:
        d = i['data']
        if d['score'] >= min_score:
            title, url, score = d['title'], d['url'], d['score']
            butest = [s for s in badurls if s.lower() in url.lower()]
            if butest:
                if DEBUG:
                    print(f"{url} is in badurls {butest}")
            else:
                post = {'title': title, 'source': f"reddit:{sub}",
                        'url': url, 'score': score}
                if url not in posts:
                    newposts[url] = post
    if newposts:
        for k, p in newposts.items():
            posts[p['url']] = p


def extract_reddits(subs):
    for sub, min_score in subs.items():
        r = get_reddit(sub, listing, limit, timeframe)
        data = r['data']['children']
        process_reddit(sub, data, min_score)


def extract_lobsters(tags):
    """Scrape the lobste.rs listing for the configured tags and collect new posts."""
    taglist = ",".join(tags.keys())
    base_url = f"https://lobste.rs/t/{taglist}"
    try:
        r = requests.get(base_url, headers={'User-agent': 'yourbot'})
    except requests.RequestException as e:
        print(f'An error occurred extracting from lobste.rs: {e}')
        sys.exit(-1)
    h = HTMLParser(r.text)
    items = h.css('ol.stories.list li')
    for i in items:
        score = i.css("div.score")[0].text()
        title = i.css("span.link a")[0].text()
        url = i.css("span.link a")[0].attrs['href']
        taglabels = ", ".join([x.text() for x in i.css("span.tags a")])
        post = {'title': title, 'source': f"lobsters:{taglabels}",
                'url': url, 'score': score}
        if url not in posts:
            newposts[url] = post
    if newposts:
        for k, p in newposts.items():
            posts[p['url']] = p


def dump_data():
    if DEBUG:
        print("Subs identified:")
        for k, v in subs.items():
            print(f"Sub: {k} - min_score: {v}")
        print("\n")
    for k, p in newposts.items():
        print(f"Src: \"{p['source']}\" Title: \"{p['title']}\" "
              f"Url: {p['url']} Score: {p['score']}")


# main
separator, operands, config = parse()

with open("badurls.txt") as f:
    badurls = f.read().splitlines()

if config.get("reddit"):
    # Each line of redditsubs.txt is "subreddit:min_score".
    with open("redditsubs.txt") as f:
        for l in f:
            sub, min_score = l.partition(":")[::2]
            subs[sub.strip()] = int(min_score)
    extract_reddits(subs)

if config.get("lobsters"):
    # Each line of lobsterstags.txt is "tag:min_score".
    with open("lobsterstags.txt") as f:
        for l in f:
            tag, min_score = l.partition(":")[::2]
            tags[tag.strip()] = int(min_score)
    extract_lobsters(tags)

dump_data()