From 84090ca8bc10853db10000940b72604a0d7c293a Mon Sep 17 00:00:00 2001
From: steve
Date: Sat, 6 Jul 2024 16:16:14 +0100
Subject: Initial release

---
 README.md        |   9 ++++
 badurls.txt      |   6 +++
 lobsterstags.txt |   9 ++++
 minlobsters.py   |  22 +++++
 minreddit.py     |  18 +++++
 redditsubs.txt   |   5 ++
 requirements.txt |   7 +++
 spider.py        | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 216 insertions(+)
 create mode 100644 README.md
 create mode 100644 badurls.txt
 create mode 100644 lobsterstags.txt
 create mode 100644 minlobsters.py
 create mode 100644 minreddit.py
 create mode 100644 redditsubs.txt
 create mode 100644 requirements.txt
 create mode 100644 spider.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4453e4e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# barkingspider
+
+A bot that pulls down new content from Atom, RSS and JSON sources, filters it against per-source score thresholds and a URL blocklist, then spits out a bunch of links.
+
+Name from Roger's Profanisaurus:
+
+**Barking Spider**
+
+n. Ringpiece; anus; chocolate starfish (qv).
\ No newline at end of file
diff --git a/badurls.txt b/badurls.txt
new file mode 100644
index 0000000..d439c59
--- /dev/null
+++ b/badurls.txt
@@ -0,0 +1,6 @@
+twitter.com
+https://x.com
+https://www.reddit.com
+imgurl.com
+i.redd.it
+troyhunt.com
diff --git a/lobsterstags.txt b/lobsterstags.txt
new file mode 100644
index 0000000..ad81f9a
--- /dev/null
+++ b/lobsterstags.txt
@@ -0,0 +1,9 @@
+security:8
+virtualization:9
+reversing:5
+privacy:9
+openbsd:14
+cryptography:6
+osdev:6
+networking:13
+compsci:16
diff --git a/minlobsters.py b/minlobsters.py
new file mode 100644
index 0000000..e383535
--- /dev/null
+++ b/minlobsters.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+import sys, getopt
+import requests
+
+from selectolax.parser import HTMLParser
+
+def extract_lobsters(tag):
+    base_url = f"https://lobste.rs/t/{tag}"
+    r = requests.get(base_url, headers={'User-agent': 'yourbot'})
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')  # The parent CSS selector; each story sits in its own li
+    for i in items:
+        score = i.css("div.score")[0].text()  # .text() recovers the text between the tags
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']  # .attrs recovers attribute content (i.e. href here)
+
+        print(f"\"{title}\" - {url} ({score})")
+
+tag = "reversing"
+extract_lobsters(tag)
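A note on the selectolax calls in minlobsters.py: indexing with css(...)[0] raises an IndexError the moment lobste.rs changes its markup. The sketch below (not part of the patch) is one defensive variant of the same loop; it assumes the selectors above and uses css_first(), which returns None instead of raising:

    # Sketch: a defensive variant of the minlobsters.py extraction loop.
    # Assumes the same lobste.rs markup/selectors as the patch above.
    import requests
    from selectolax.parser import HTMLParser

    r = requests.get("https://lobste.rs/t/reversing", headers={'User-agent': 'yourbot'})
    for i in HTMLParser(r.text).css('ol.stories.list li'):
        link = i.css_first("span.link a")   # css_first returns None rather than raising
        score = i.css_first("div.score")
        if link is None:                    # skip anything that no longer matches
            continue
        url = link.attributes.get('href')   # .attributes is the plain attribute dict
        print(f"\"{link.text()}\" - {url} ({score.text() if score else '?'})")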
diff --git a/minreddit.py b/minreddit.py
new file mode 100644
index 0000000..73ddd41
--- /dev/null
+++ b/minreddit.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import sys, getopt
+import requests
+
+def extract_reddit(sub):
+    base_url = f"https://www.reddit.com/r/{sub}.json"
+    r = requests.get(base_url, headers={'User-agent': 'yourbot'})
+    data = r.json()['data']['children']
+
+    for i in data:
+        d = i['data']
+        title, url, score = d['title'], d['url'], d['score']
+
+        print(f"\"{title}\" - {url} ({score})")
+
+sub = "blueteamsec"
+extract_reddit(sub)
diff --git a/redditsubs.txt b/redditsubs.txt
new file mode 100644
index 0000000..61b7109
--- /dev/null
+++ b/redditsubs.txt
@@ -0,0 +1,5 @@
+ReverseEngineering:1
+redteamsec:8
+blueteamsec:12
+netsec:30
+cybersecurity:100
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e64a534
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+certifi==2024.2.2
+charset-normalizer==3.3.2
+diskcache==5.6.3
+idna==3.7
+requests==2.31.0
+selectolax==0.3.21
+urllib3==2.2.1
diff --git a/spider.py b/spider.py
new file mode 100644
index 0000000..0474c41
--- /dev/null
+++ b/spider.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+
+import sys, getopt
+import requests
+
+from diskcache import Index
+from selectolax.parser import HTMLParser
+
+DEBUG = False
+USAGE = f"Usage: python {sys.argv[0]} [--help] | [--version] | [--reddit] | [--lobsters]"
+VERSION = f"{sys.argv[0]} version 1.0.0"
+
+subs = {}
+tags = {}
+limit = 100
+timeframe = 'week'  # hour, day, week, month, year, all
+listing = 'top'  # controversial, best, hot, new, random, rising, top
+badurls = []
+posts = Index('data/results')
+newposts = {}
+reddit = False
+lobsters = False
+
+def parse():
+    config = {}
+    options, arguments = getopt.getopt(
+        sys.argv[1:],  # Arguments
+        'vrlh',  # Short option definitions
+        ["version", "reddit", "lobsters", "help"])  # Long option definitions
+    separator = "\n"
+    for o, a in options:
+        if o in ("-v", "--version"):
+            print(VERSION)
+            sys.exit()
+        if o in ("-r", "--reddit"):
+            config['reddit'] = True
+        if o in ("-l", "--lobsters"):
+            config['lobsters'] = True
+        if o in ("-h", "--help"):
+            print(USAGE)
+            sys.exit()
+    if not options:
+        raise SystemExit(USAGE)
+    try:
+        operands = [int(arg) for arg in arguments]
+    except ValueError:
+        raise SystemExit(USAGE)
+    return separator, operands, config
+
+
+def get_reddit(subreddit, listing, limit, timeframe):
+    try:
+        base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}'
+        request = requests.get(base_url, headers={'User-agent': 'yourbot'})
+    except requests.exceptions.RequestException:
+        raise SystemExit('An error occurred fetching from reddit')
+    return request.json()
+
+def process_reddit(sub, data, min_score):
+    for i in data:
+        d = i['data']
+        if d['score'] >= min_score:
+            title, url, score = d['title'], d['url'], d['score']
+
+            butest = [s for s in badurls if s.lower() in url.lower()]
+            if butest:
+                if DEBUG:
+                    print(f"{url} is in badurls {butest}")
+            else:
+                post = {'title': title, 'source': f"reddit:{sub}", 'url': url, 'score': score}
+                if url not in posts:
+                    newposts[url] = post
+
+    if newposts:
+        for k, p in newposts.items():
+            posts[p['url']] = p
+
+def extract_reddits(subs):
+    for sub, min_score in subs.items():
+        r = get_reddit(sub, listing, limit, timeframe)
+        data = r['data']['children']
+        process_reddit(sub, data, min_score)
+
+def extract_lobsters(tags):
+    taglist = ",".join(tags.keys())
+    try:
+        base_url = f"https://lobste.rs/t/{taglist}"
+        r = requests.get(base_url, headers={'User-agent': 'yourbot'})
+    except requests.exceptions.RequestException:
+        print('An error occurred extracting from lobste.rs')
+        sys.exit(-1)
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')
+    for i in items:
+        score = i.css("div.score")[0].text()
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']
+        story_tags = ", ".join([x.text() for x in i.css("span.tags a")])  # avoid shadowing the tags argument
+        post = {'title': title, 'source': f"lobsters:{story_tags}", 'url': url, 'score': score}
+        if url not in posts:
+            newposts[url] = post
+
+    if newposts:
+        for k, p in newposts.items():
+            posts[p['url']] = p
+
+
+def dump_data():
+    if DEBUG:
+        print("Subs identified:")
+        for k, v in subs.items():
+            print(f"Sub: {k} - min_score: {v}")
+
+    print("\n")
+
+    for k, p in newposts.items():
+        print(f"Src: \"{p['source']}\" Title: \"{p['title']}\" Url: {p['url']} Score: {p['score']}")
+
+# main
+separator, operands, config = parse()
+
+with open("badurls.txt") as f:
+    badurls = f.read().splitlines()
+
+if config.get("reddit"):
+    with open("redditsubs.txt") as f:
+        for l in f:
+            sub, min_score = l.partition(":")[::2]
+            subs[sub.strip()] = int(min_score)
+    extract_reddits(subs)
+
+if config.get("lobsters"):
+    with open("lobsterstags.txt") as f:
+        for l in f:
+            tag, min_score = l.partition(":")[::2]
+            tags[tag.strip()] = int(min_score)
+    extract_lobsters(tags)
+
+dump_data()
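spider.py's dedup hinges on the diskcache Index at data/results: a persistent, dict-like store keyed by URL, so across runs a post is only reported the first time it is seen. Below is a minimal sketch (not part of the patch) for inspecting or resetting that store after a run such as python spider.py --reddit --lobsters; it assumes the same data/results path used above:

    # Sketch: inspect or reset spider.py's persistent dedup store.
    # Assumes the 'data/results' path used in the patch above.
    from diskcache import Index

    posts = Index('data/results')
    print(f"{len(posts)} posts cached")
    for url, post in posts.items():
        print(f"{post['score']} {post['source']}: {post['title']}")
    # posts.clear()  # wipe the cache so every post is treated as new again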
open("lobsterstags.txt") as f: + for l in f: + tag,min_score = l.partition(":")[::2] + tags[tag.strip()] = int(min_score) + extract_lobsters(tags) + +dump_data() -- cgit v1.2.3