| author | steve <steve@haxors.club> | 2024-07-06 16:16:14 +0100 |
|---|---|---|
| committer | steve <steve@haxors.club> | 2024-07-06 16:16:14 +0100 |
| commit | 84090ca8bc10853db10000940b72604a0d7c293a (patch) | |
| tree | 4da8717c69f6e2420d748a6b2dd4430494a0fc50 | |
Initial release
| Mode | File | Insertions |
|---|---|---|
| -rw-r--r-- | README.md | 9 |
| -rw-r--r-- | badurls.txt | 6 |
| -rw-r--r-- | lobsterstags.txt | 9 |
| -rw-r--r-- | minlobsters.py | 22 |
| -rw-r--r-- | minreddit.py | 18 |
| -rw-r--r-- | redditsubs.txt | 5 |
| -rw-r--r-- | requirements.txt | 7 |
| -rw-r--r-- | spider.py | 140 |
8 files changed, 216 insertions, 0 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4453e4e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# barkingspider
+
+A bot that pulls down new content from Atom, RSS and JSON sources, filters it based on criteria, then spits out a bunch of links.
+
+Name from Roger's Profanisaurus:
+
+**Barking Spider**
+
+n. Ringpiece; anus; chocolate starfish (qv).
\ No newline at end of file
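The "filters it based on criteria" above boils down to two checks in spider.py at the bottom of this commit: a per-source minimum score, and a case-insensitive substring blocklist read from badurls.txt. A minimal sketch of that criterion, with the function name `should_keep` invented for illustration:

```python
def should_keep(url, score, min_score, badurls):
    """Keep a post only if it meets the source's minimum score and its
    URL contains no blocklisted substring (matched case-insensitively)."""
    if score < min_score:
        return False
    return not any(bad.lower() in url.lower() for bad in badurls)

# "i.redd.it" is in badurls.txt, so direct reddit image links are dropped:
print(should_keep("https://i.redd.it/abc.png", 50, 30, ["i.redd.it"]))  # False
```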
diff --git a/badurls.txt b/badurls.txt
new file mode 100644
index 0000000..d439c59
--- /dev/null
+++ b/badurls.txt
@@ -0,0 +1,6 @@
+twitter.com
+https://x.com
+https://www.reddit.com
+imgurl.com
+i.redd.it
+troyhunt.com

diff --git a/lobsterstags.txt b/lobsterstags.txt
new file mode 100644
index 0000000..ad81f9a
--- /dev/null
+++ b/lobsterstags.txt
@@ -0,0 +1,9 @@
+security:8
+virtualization:9
+reversing:5
+privacy:9
+openbsd:14
+cryptography:6
+osdev:6
+networking:13
+compsci:16

diff --git a/minlobsters.py b/minlobsters.py
new file mode 100644
index 0000000..e383535
--- /dev/null
+++ b/minlobsters.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+import sys, getopt
+import requests
+
+from selectolax.parser import HTMLParser
+
+def extract_lobsters(tag):
+    base_url = f"https://lobste.rs/t/{tag}"
+    r = requests.get(base_url, headers = {'User-agent': 'yourbot'})
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')  # the parent CSS selector; each article is in an li
+    for i in items:
+        score = i.css("div.score")[0].text()  # .text() recovers the text between tags
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']  # .attrs recovers attribute content (i.e. the href here)
+
+        print(f"\"{title}\" - {url} ({score})")
+
+tag = "reversing"
+extract_lobsters(tag)

diff --git a/minreddit.py b/minreddit.py
new file mode 100644
index 0000000..73ddd41
--- /dev/null
+++ b/minreddit.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import sys, getopt
+import requests
+
+def extract_reddit(sub):
+    base_url = f"https://www.reddit.com/r/{sub}.json"
+    r = requests.get(base_url, headers = {'User-agent': 'yourbot'})
+    data = r.json()['data']['children']
+
+    for i in data:
+        d = i['data']
+        title, url, score = d['title'], d['url'], d['score']
+
+        print(f"\"{title}\" - {url} ({score})")
+
+sub = "blueteamsec"
+extract_reddit(sub)

diff --git a/redditsubs.txt b/redditsubs.txt
new file mode 100644
index 0000000..61b7109
--- /dev/null
+++ b/redditsubs.txt
@@ -0,0 +1,5 @@
+ReverseEngineering:1
+redteamsec:8
+blueteamsec:12
+netsec:30
+cybersecurity:100

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e64a534
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+certifi==2024.2.2
+charset-normalizer==3.3.2
+diskcache==5.6.3
+idna==3.7
+requests==2.31.0
+selectolax==0.3.21
+urllib3==2.2.1
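A note on redditsubs.txt and lobsterstags.txt above: each line is `name:min_score`, the minimum score a post from that source needs before it is kept. spider.py below loads them with the `partition(":")[::2]` idiom; `str.partition` returns a `(head, separator, tail)` triple, and the `[::2]` slice keeps just the head and tail:

```python
line = "netsec:30"
sub, min_score = line.partition(":")[::2]  # ('netsec', ':', '30')[::2] -> ('netsec', '30')
print(sub, int(min_score))  # netsec 30
```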
diff --git a/spider.py b/spider.py
new file mode 100644
index 0000000..0474c41
--- /dev/null
+++ b/spider.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+
+import sys, getopt
+import requests
+
+from diskcache import Index
+from selectolax.parser import HTMLParser
+
+DEBUG = False
+USAGE = f"Usage: python {sys.argv[0]} [--help] | [--version] | [--reddit] | [--lobsters]"
+VERSION = f"{sys.argv[0]} version 1.0.0"
+
+subs = {}
+tags = {}
+limit = 100
+timeframe = 'week'  # hour, day, week, month, year, all
+listing = 'top'  # controversial, best, hot, new, random, rising, top
+badurls = []
+posts = Index('data/results')
+newposts = {}
+reddit = False
+lobsters = False
+
+def parse():
+    config = {}
+    options, arguments = getopt.getopt(
+        sys.argv[1:],  # arguments
+        'vrlh',  # short option definitions
+        ["version", "reddit", "lobsters", "help"])  # long option definitions
+    separator = "\n"
+    for o, a in options:
+        if o in ("-v", "--version"):
+            print(VERSION)
+            sys.exit()
+        if o in ("-r", "--reddit"):
+            config['reddit'] = True
+        if o in ("-l", "--lobsters"):
+            config['lobsters'] = True
+        if o in ("-h", "--help"):
+            print(USAGE)
+            sys.exit()
+    if not options:
+        raise SystemExit(USAGE)
+    try:
+        operands = [int(arg) for arg in arguments]
+    except ValueError:
+        raise SystemExit(USAGE)
+    return separator, operands, config
+
+
+def get_reddit(subreddit, listing, limit, timeframe):
+    try:
+        base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}'
+        request = requests.get(base_url, headers = {'User-agent': 'yourbot'})
+    except requests.RequestException as e:
+        raise SystemExit(f'Error fetching from reddit: {e}')
+    return request.json()
+
+def process_reddit(sub, data, min_score):
+    for i in data:
+        d = i['data']
+        if d['score'] >= min_score:
+            title, url, score = d['title'], d['url'], d['score']
+
+            butest = [s for s in badurls if s.lower() in url.lower()]
+            if butest:
+                if DEBUG:
+                    print(f"{url} is in badurls {butest}")
+            else:
+                post = {'title': title, 'source': f"reddit:{sub}", 'url': url, 'score': score}
+                if url not in posts:
+                    newposts[url] = post
+
+    if newposts:
+        for k, p in newposts.items():
+            posts[p['url']] = p
+
+def extract_reddits(subs):
+    for sub, min_score in subs.items():
+        r = get_reddit(sub, listing, limit, timeframe)
+        data = r['data']['children']
+        process_reddit(sub, data, min_score)
+
+def extract_lobsters(tags):
+    taglist = ",".join(tags.keys())
+    try:
+        base_url = f"https://lobste.rs/t/{taglist}"
+        r = requests.get(base_url, headers = {'User-agent': 'yourbot'})
+    except requests.RequestException:
+        print('An error extracting from lobste.rs occurred')
+        sys.exit(-1)
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')
+    for i in items:
+        score = i.css("div.score")[0].text()
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']
+        itemtags = ", ".join([x.text() for x in i.css("span.tags a")])
+        post = {'title': title, 'source': f"lobsters:{itemtags}", 'url': url, 'score': score}
+        if url not in posts:
+            newposts[url] = post
+
+    if newposts:
+        for k, p in newposts.items():
+            posts[p['url']] = p
+
+
+def dump_data():
+    if DEBUG:
+        print("Subs identified:")
+        for k, v in subs.items():
+            print(f"Sub: {k} - min_score: {v}")
+
+    print("\n")
+
+    for k, p in newposts.items():
+        print(f"Src: \"{p['source']}\" Title: \"{p['title']}\" Url: {p['url']} Score: {p['score']}")
+
+# main
+separator, operands, config = parse()
+
+with open("badurls.txt") as f:
+    badurls = f.read().splitlines()
+
+if config.get("reddit"):
+    with open("redditsubs.txt") as f:
+        for l in f:
+            sub, min_score = l.partition(":")[::2]
+            subs[sub.strip()] = int(min_score)
+    extract_reddits(subs)
+
+if config.get("lobsters"):
+    with open("lobsterstags.txt") as f:
+        for l in f:
+            tag, min_score = l.partition(":")[::2]
+            tags[tag.strip()] = int(min_score)
+    extract_lobsters(tags)
+
+dump_data()
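spider.py deduplicates across runs by keeping every stored post in a `diskcache.Index`, a dict-like mapping persisted to disk (here under `data/results`): once a URL has been written on one run, `url not in posts` is false on every later run, so the post never re-enters `newposts`. The same pattern in isolation, against a throwaway demo path:

```python
from diskcache import Index

seen = Index('/tmp/barkingspider-demo')  # directory is created if missing

url = "https://example.com/post"
if url not in seen:
    seen[url] = {'title': 'Example', 'score': 42}
    print("new post")
else:
    print("already seen")  # printed on every run after the first
```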

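To try the whole thing: `pip install -r requirements.txt`, then `python spider.py -r -l` (or the long `--reddit` / `--lobsters` forms); with no flags, `parse()` exits with the usage string. New posts are printed by `dump_data()` and persisted under `data/results` for the next run's dedup check.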