summaryrefslogtreecommitdiffstats
path: root/spider.py
diff options
context:
space:
mode:
authorsteve <steve@haxors.club>2024-07-06 16:16:14 +0100
committersteve <steve@haxors.club>2024-07-06 16:16:14 +0100
commit84090ca8bc10853db10000940b72604a0d7c293a (patch)
tree4da8717c69f6e2420d748a6b2dd4430494a0fc50 /spider.py
downloadbarkingspider-84090ca8bc10853db10000940b72604a0d7c293a.tar.gz
barkingspider-84090ca8bc10853db10000940b72604a0d7c293a.tar.bz2
barkingspider-84090ca8bc10853db10000940b72604a0d7c293a.zip
Initial release
Diffstat (limited to 'spider.py')
-rw-r--r--spider.py140
1 files changed, 140 insertions, 0 deletions
diff --git a/spider.py b/spider.py
new file mode 100644
index 0000000..0474c41
--- /dev/null
+++ b/spider.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python3

import sys, getopt
import requests

from diskcache import Index
from selectolax.parser import HTMLParser

# When True, print extra diagnostics (matched badurls, configured subs).
DEBUG = False
USAGE = f"Usage: python {sys.argv[0]} [--help] | [--version] | [--reddit] | [--lobsters]"
VERSION = f"{sys.argv[0]} version 1.0.0"

# subreddit name -> minimum score threshold; populated from redditsubs.txt.
subs = {}
# lobste.rs tag -> minimum score threshold; populated from lobsterstags.txt.
# NOTE(review): the threshold is read but never applied in extract_lobsters.
tags = {}
# Maximum number of posts requested per subreddit listing.
limit = 100
timeframe = 'week' #hour, day, week, month, year, all
listing = 'top' # controversial, best, hot, new, random, rising, top
# URL substrings to exclude; populated from badurls.txt at startup.
badurls = []
# Persistent on-disk store (diskcache) of every post seen, keyed by URL.
posts = Index('data/results')
# Posts discovered this run that were not already in `posts`, keyed by URL.
newposts = {}
reddit = False
lobsters = False
+
def parse():
    """Parse command-line options for the spider.

    Returns:
        (separator, operands, config): ``separator`` is the output separator
        (currently always ``"\\n"``), ``operands`` is the list of trailing
        integer arguments, and ``config`` maps enabled source names
        (``"reddit"``, ``"lobsters"``) to ``True``.

    Raises:
        SystemExit: on an unknown option, when no options are given, when a
        trailing argument is not an integer, or after printing
        ``--help``/``--version``.
    """
    config = {}
    try:
        options, arguments = getopt.getopt(
            sys.argv[1:],  # Arguments
            'vrlh',        # Short option definitions
            ["version", "reddit", "lobsters", "help"])  # Long option definitions
    except getopt.GetoptError:
        # An unknown option previously escaped as a traceback; show usage.
        raise SystemExit(USAGE)
    separator = "\n"
    for o, _ in options:
        if o in ("-v", "--version"):
            print(VERSION)
            sys.exit()
        if o in ("-r", "--reddit"):
            config['reddit'] = True
        if o in ("-l", "--lobsters"):
            config['lobsters'] = True
        if o in ("-h", "--help"):
            print(USAGE)
            sys.exit()
    if not options:
        raise SystemExit(USAGE)
    try:
        operands = [int(arg) for arg in arguments]
    except ValueError:
        raise SystemExit(USAGE)
    return separator, operands, config
+
+
def get_reddit(subreddit, listing, limit, timeframe):
    """Fetch one subreddit listing from reddit's JSON API.

    Args:
        subreddit: subreddit name (without the ``r/`` prefix).
        listing: listing type, e.g. 'top', 'hot', 'new'.
        limit: maximum number of posts to request.
        timeframe: window for scored listings ('hour', 'day', 'week', ...).

    Returns:
        The decoded JSON response body (a dict).
    """
    base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}'
    try:
        request = requests.get(base_url, headers={'User-agent': 'yourbot'})
        # Treat HTTP error responses (403/429/5xx) as failures too, instead
        # of handing non-listing JSON to the caller.
        request.raise_for_status()
    except requests.exceptions.RequestException as e:
        # The original bare `except` printed a message and then crashed with
        # NameError because `request` was never bound; exit explicitly.
        print(f'An error occurred fetching {base_url}: {e}')
        sys.exit(-1)
    return request.json()
+
def process_reddit(sub, data, min_score):
    """Queue reddit posts from `data` scoring at least `min_score`.

    Posts whose URL matches an entry in the module-level `badurls` list are
    skipped; posts already present in the persistent `posts` store are not
    re-queued. All queued posts are then flushed into `posts`.
    """
    for item in data:
        details = item['data']
        if details['score'] < min_score:
            continue

        title = details['title']
        url = details['url']
        score = details['score']

        # Case-insensitive substring match against the blocklist.
        matched = [bad for bad in badurls if bad.lower() in url.lower()]
        if matched:
            if DEBUG:
                print(f"{url} is in badurls {matched}")
            continue

        if url not in posts:
            newposts[url] = {'title': title, 'source': f"reddit:{sub}", 'url': url, 'score': score}

    # Persist everything queued so far.
    for entry in newposts.values():
        posts[entry['url']] = entry
+
def extract_reddits(subs):
    """Fetch and process every configured subreddit.

    `subs` maps subreddit names to their minimum-score thresholds; the
    module-level `listing`, `limit` and `timeframe` control the query.
    """
    for subreddit, threshold in subs.items():
        payload = get_reddit(subreddit, listing, limit, timeframe)
        process_reddit(subreddit, payload['data']['children'], threshold)
+
def extract_lobsters(tags):
    """Scrape the lobste.rs listing for the configured tags and queue
    stories not already in the persistent `posts` store.

    Args:
        tags: mapping of tag name -> minimum score.

    NOTE(review): the min_score values in `tags` are currently ignored here
    (unlike process_reddit) — confirm whether lobste.rs stories should be
    score-filtered as well.
    """
    taglist = ",".join(tags.keys())
    base_url = f"https://lobste.rs/t/{taglist}"
    try:
        r = requests.get(base_url, headers={'User-agent': 'yourbot'})
    except requests.exceptions.RequestException:
        # The original bare `except` could also mask unrelated errors and,
        # had the request itself failed, `r` would have been unbound below.
        print('An Error extracting from lobste.rs occured')
        sys.exit(-1)

    h = HTMLParser(r.text)
    for item in h.css('ol.stories.list li'):
        score = item.css("div.score")[0].text()
        link = item.css("span.link a")[0]
        title = link.text()
        url = link.attrs['href']
        # Distinct name on purpose: the original rebound the `tags`
        # parameter to this per-story string, shadowing the config dict.
        story_tags = ", ".join(x.text() for x in item.css("span.tags a"))
        post = {'title': title, 'source': f"lobsters:{story_tags}", 'url': url, 'score': score}
        if url not in posts:
            newposts[url] = post

    # Persist everything queued so far.
    for p in newposts.values():
        posts[p['url']] = p
+
+
def dump_data():
    """Print every newly discovered post; when DEBUG is set, first list the
    configured subreddits and their minimum-score thresholds."""
    if DEBUG:
        print("Subs identified:")
        for name, threshold in subs.items():
            print(f"Sub: {name} - min_score: {threshold}")
        print("\n")

    for p in newposts.values():
        print(f"Src: \"{p['source']}\" Title: \"{p['title']}\" Url: {p['url']} Score: {p['score']}")
+
# main
separator, operands, config = parse()

# Load the URL blocklist; one substring per line.
with open("badurls.txt") as f:
    badurls = f.read().splitlines()

if config.get("reddit"):
    # Each line is "<subreddit>: <min_score>". Blank lines are skipped so a
    # trailing newline no longer crashes with ValueError from int("").
    with open("redditsubs.txt") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            sub, min_score = line.partition(":")[::2]
            subs[sub.strip()] = int(min_score)
    extract_reddits(subs)

if config.get("lobsters"):
    # Each line is "<tag>: <min_score>" (min_score is read but currently
    # unused by extract_lobsters).
    with open("lobsterstags.txt") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            tag, min_score = line.partition(":")[::2]
            tags[tag.strip()] = int(min_score)
    extract_lobsters(tags)

dump_data()