summaryrefslogtreecommitdiffstats
path: root/spider.py
diff options
context:
space:
mode:
authorsteve <steve@haxors.club>2024-07-06 16:16:14 +0100
committersteve <steve@haxors.club>2024-07-06 16:16:14 +0100
commit84090ca8bc10853db10000940b72604a0d7c293a (patch)
tree4da8717c69f6e2420d748a6b2dd4430494a0fc50 /spider.py
downloadbarkingspider-84090ca8bc10853db10000940b72604a0d7c293a.tar.gz
barkingspider-84090ca8bc10853db10000940b72604a0d7c293a.tar.bz2
barkingspider-84090ca8bc10853db10000940b72604a0d7c293a.zip
Initial release
Diffstat (limited to 'spider.py')
-rw-r--r--spider.py140
1 files changed, 140 insertions, 0 deletions
diff --git a/spider.py b/spider.py
new file mode 100644
index 0000000..0474c41
--- /dev/null
+++ b/spider.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python3

import sys, getopt
import requests

from diskcache import Index
from selectolax.parser import HTMLParser

# When True, print extra diagnostics (matched badurls, configured subs).
DEBUG = False
USAGE = f"Usage: python {sys.argv[0]} [--help] | [--version] | [--reddit] | [--lobsters]"
VERSION = f"{sys.argv[0]} version 1.0.0"

# subreddit name -> minimum score threshold; populated from redditsubs.txt.
subs = {}
# lobste.rs tag -> minimum score threshold; populated from lobsterstags.txt.
# NOTE(review): the threshold is read but never applied in extract_lobsters.
tags = {}
# Maximum number of posts requested per subreddit listing.
limit = 100
timeframe = 'week' #hour, day, week, month, year, all
listing = 'top' # controversial, best, hot, new, random, rising, top
# URL substrings to exclude; populated from badurls.txt at startup.
badurls = []
# Persistent on-disk store (diskcache) of every post seen, keyed by URL.
posts = Index('data/results')
# Posts discovered this run that were not already in `posts`, keyed by URL.
newposts = {}
reddit = False
lobsters = False
+
def parse():
    """Parse command-line options for the spider.

    Returns:
        (separator, operands, config): ``separator`` is the output separator
        (currently always ``"\\n"``), ``operands`` is the list of trailing
        integer arguments, and ``config`` maps enabled source names
        (``"reddit"``, ``"lobsters"``) to ``True``.

    Raises:
        SystemExit: on an unknown option, when no options are given, when a
        trailing argument is not an integer, or after printing
        ``--help``/``--version``.
    """
    config = {}
    try:
        options, arguments = getopt.getopt(
            sys.argv[1:],  # Arguments
            'vrlh',        # Short option definitions
            ["version", "reddit", "lobsters", "help"])  # Long option definitions
    except getopt.GetoptError:
        # An unknown option previously escaped as a traceback; show usage.
        raise SystemExit(USAGE)
    separator = "\n"
    for o, _ in options:
        if o in ("-v", "--version"):
            print(VERSION)
            sys.exit()
        if o in ("-r", "--reddit"):
            config['reddit'] = True
        if o in ("-l", "--lobsters"):
            config['lobsters'] = True
        if o in ("-h", "--help"):
            print(USAGE)
            sys.exit()
    if not options:
        raise SystemExit(USAGE)
    try:
        operands = [int(arg) for arg in arguments]
    except ValueError:
        raise SystemExit(USAGE)
    return separator, operands, config
+
+
def get_reddit(subreddit, listing, limit, timeframe):
    """Fetch one subreddit listing from reddit's JSON API.

    Args:
        subreddit: subreddit name (without the ``r/`` prefix).
        listing: listing type, e.g. 'top', 'hot', 'new'.
        limit: maximum number of posts to request.
        timeframe: window for scored listings ('hour', 'day', 'week', ...).

    Returns:
        The decoded JSON response body (a dict).
    """
    base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}'
    try:
        request = requests.get(base_url, headers={'User-agent': 'yourbot'})
        # Treat HTTP error responses (403/429/5xx) as failures too, instead
        # of handing non-listing JSON to the caller.
        request.raise_for_status()
    except requests.exceptions.RequestException as e:
        # The original bare `except` printed a message and then crashed with
        # NameError because `request` was never bound; exit explicitly.
        print(f'An error occurred fetching {base_url}: {e}')
        sys.exit(-1)
    return request.json()
+
def process_reddit(sub, data, min_score):
    """Queue reddit posts from `data` scoring at least `min_score`.

    Posts whose URL matches an entry in the module-level `badurls` list are
    skipped; posts already present in the persistent `posts` store are not
    re-queued. All queued posts are then flushed into `posts`.
    """
    for item in data:
        details = item['data']
        if details['score'] < min_score:
            continue

        title = details['title']
        url = details['url']
        score = details['score']

        # Case-insensitive substring match against the blocklist.
        matched = [bad for bad in badurls if bad.lower() in url.lower()]
        if matched:
            if DEBUG:
                print(f"{url} is in badurls {matched}")
            continue

        if url not in posts:
            newposts[url] = {'title': title, 'source': f"reddit:{sub}", 'url': url, 'score': score}

    # Persist everything queued so far.
    for entry in newposts.values():
        posts[entry['url']] = entry
+
def extract_reddits(subs):
    """Fetch and process every configured subreddit.

    `subs` maps subreddit names to their minimum-score thresholds; the
    module-level `listing`, `limit` and `timeframe` control the query.
    """
    for subreddit, threshold in subs.items():
        payload = get_reddit(subreddit, listing, limit, timeframe)
        process_reddit(subreddit, payload['data']['children'], threshold)
+
def extract_lobsters(tags):
    """Scrape the lobste.rs listing for the configured tags and queue
    stories not already in the persistent `posts` store.

    Args:
        tags: mapping of tag name -> minimum score.

    NOTE(review): the min_score values in `tags` are currently ignored here
    (unlike process_reddit) — confirm whether lobste.rs stories should be
    score-filtered as well.
    """
    taglist = ",".join(tags.keys())
    base_url = f"https://lobste.rs/t/{taglist}"
    try:
        r = requests.get(base_url, headers={'User-agent': 'yourbot'})
    except requests.exceptions.RequestException:
        # The original bare `except` could also mask unrelated errors and,
        # had the request itself failed, `r` would have been unbound below.
        print('An Error extracting from lobste.rs occured')
        sys.exit(-1)

    h = HTMLParser(r.text)
    for item in h.css('ol.stories.list li'):
        score = item.css("div.score")[0].text()
        link = item.css("span.link a")[0]
        title = link.text()
        url = link.attrs['href']
        # Distinct name on purpose: the original rebound the `tags`
        # parameter to this per-story string, shadowing the config dict.
        story_tags = ", ".join(x.text() for x in item.css("span.tags a"))
        post = {'title': title, 'source': f"lobsters:{story_tags}", 'url': url, 'score': score}
        if url not in posts:
            newposts[url] = post

    # Persist everything queued so far.
    for p in newposts.values():
        posts[p['url']] = p
+
+
def dump_data():
    """Print every newly discovered post; when DEBUG is set, first list the
    configured subreddits and their minimum-score thresholds."""
    if DEBUG:
        print("Subs identified:")
        for name, threshold in subs.items():
            print(f"Sub: {name} - min_score: {threshold}")
        print("\n")

    for p in newposts.values():
        print(f"Src: \"{p['source']}\" Title: \"{p['title']}\" Url: {p['url']} Score: {p['score']}")
+
# main
separator, operands, config = parse()

# Load the URL blocklist; one substring per line.
with open("badurls.txt") as f:
    badurls = f.read().splitlines()

if config.get("reddit"):
    # Each line is "<subreddit>: <min_score>". Blank lines are skipped so a
    # trailing newline no longer crashes with ValueError from int("").
    with open("redditsubs.txt") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            sub, min_score = line.partition(":")[::2]
            subs[sub.strip()] = int(min_score)
    extract_reddits(subs)

if config.get("lobsters"):
    # Each line is "<tag>: <min_score>" (min_score is read but currently
    # unused by extract_lobsters).
    with open("lobsterstags.txt") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            tag, min_score = line.partition(":")[::2]
            tags[tag.strip()] = int(min_score)
    extract_lobsters(tags)

dump_data()