diff options
Diffstat (limited to 'spider.py')
| -rw-r--r-- | spider.py | 140 |
1 file changed, 140 insertions, 0 deletions
diff --git a/spider.py b/spider.py
new file mode 100644
index 0000000..0474c41
--- /dev/null
+++ b/spider.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+
+import sys
+import getopt
+
+import requests
+from diskcache import Index
+from selectolax.parser import HTMLParser
+
+DEBUG = False
+USAGE = f"Usage: python {sys.argv[0]} [--help] | [--version] | [--reddit] | [--lobsters]"
+VERSION = f"{sys.argv[0]} version 1.0.0"
+
+subs = {}
+tags = {}
+limit = 100
+timeframe = 'week'  # hour, day, week, month, year, all
+listing = 'top'  # controversial, best, hot, new, random, rising, top
+badurls = []
+posts = Index('data/results')  # persistent store of every post seen so far
+newposts = {}  # posts discovered during this run only
+
+def parse():
+    config = {}
+    options, arguments = getopt.getopt(
+        sys.argv[1:],  # Arguments
+        'vrlh',  # Short option definitions
+        ["version", "reddit", "lobsters", "help"])  # Long option definitions
+    separator = "\n"
+    for o, a in options:
+        if o in ("-v", "--version"):
+            print(VERSION)
+            sys.exit()
+        if o in ("-r", "--reddit"):
+            config['reddit'] = True
+        if o in ("-l", "--lobsters"):
+            config['lobsters'] = True
+        if o in ("-h", "--help"):
+            print(USAGE)
+            sys.exit()
+    if not options:
+        raise SystemExit(USAGE)
+    try:
+        operands = [int(arg) for arg in arguments]
+    except ValueError:
+        raise SystemExit(USAGE)
+    return separator, operands, config
+
+
+def get_reddit(subreddit, listing, limit, timeframe):
+    base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}'
+    try:
+        r = requests.get(base_url, headers={'User-agent': 'yourbot'})
+        r.raise_for_status()
+    except requests.RequestException as e:
+        sys.exit(f"An error occurred fetching r/{subreddit}: {e}")
+    return r.json()
+
+def process_reddit(sub, data, min_score):
+    for i in data:
+        d = i['data']
+        if d['score'] >= min_score:
+            title, url, score = d['title'], d['url'], d['score']
+            # drop anything whose URL contains a blocked substring
+            butest = [s for s in badurls if s.lower() in url.lower()]
+            if butest:
+                if DEBUG:
+                    print(f"{url} is in badurls {butest}")
+            else:
+                post = {'title': title, 'source': f"reddit:{sub}", 'url': url, 'score': score}
+                if url not in posts:
+                    newposts[url] = post
+
+    # persist this run's finds so they are not reported again next time
+    for url, p in newposts.items():
+        posts[url] = p
+
+def extract_reddits(subs):
+    for sub, min_score in subs.items():
+        r = get_reddit(sub, listing, limit, timeframe)
+        data = r['data']['children']
+        process_reddit(sub, data, min_score)
+
+def extract_lobsters(tags):
+    taglist = ",".join(tags.keys())
+    base_url = f"https://lobste.rs/t/{taglist}"
+    try:
+        r = requests.get(base_url, headers={'User-agent': 'yourbot'})
+        r.raise_for_status()
+    except requests.RequestException as e:
+        print(f"An error occurred extracting from lobste.rs: {e}")
+        sys.exit(-1)
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')
+    for i in items:
+        score = i.css("div.score")[0].text()
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']
+        post_tags = ", ".join([x.text() for x in i.css("span.tags a")])
+        post = {'title': title, 'source': f"lobsters:{post_tags}", 'url': url, 'score': score}
+        if url not in posts:
+            newposts[url] = post
+
+    for url, p in newposts.items():
+        posts[url] = p
+
+
+def dump_data():
+    if DEBUG:
+        print("Subs identified:")
+        for k, v in subs.items():
+            print(f"Sub: {k} - min_score: {v}")
+        print("\n")
+
+    for k, p in newposts.items():
+        print(f"Src: \"{p['source']}\" Title: \"{p['title']}\" Url: {p['url']} Score: {p['score']}")
+
+# main
+separator, operands, config = parse()
+
+with open("badurls.txt") as f:
+    badurls = f.read().splitlines()
+
+if config.get("reddit"):
+    with open("redditsubs.txt") as f:
+        for line in f:
+            # each line is "subreddit:min_score"
+            sub, min_score = line.partition(":")[::2]
+            subs[sub.strip()] = int(min_score)
+    extract_reddits(subs)
+
+if config.get("lobsters"):
+    with open("lobsterstags.txt") as f:
+        for line in f:
+            # each line is "tag:min_score" (min_score is not yet enforced here)
+            tag, min_score = line.partition(":")[::2]
+            tags[tag.strip()] = int(min_score)
+    extract_lobsters(tags)
+
+dump_data()
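
The script reads its configuration from three plain-text files in the working directory; the expected formats follow from the parsing above (splitlines() for badurls.txt, partition(":") plus int() for the other two). A sketch with placeholder entries (the names and thresholds below are illustrative, not shipped with this commit):

badurls.txt holds one substring per line; any post URL containing one, case-insensitively, is skipped:

    example.com/ads

redditsubs.txt and lobsterstags.txt hold one name:min_score entry per line:

    programming:250
    python:100

With those files in place, a run per the USAGE string looks like:

    python spider.py --reddit --lobsters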

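Results persist in the diskcache Index under data/results, so a later run sees everything already stored and only surfaces new posts. The same dict-style API can read the store back outside the spider, e.g. from a separate report script; a minimal sketch, assuming the index directory is unchanged:

    from diskcache import Index

    posts = Index('data/results')
    for url, post in posts.items():
        print(f"{post['score']:>6}  {post['source']}  {url}")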