 README.md        |  18 ++++++++++++++++++
 badurls.txt      |   6 ++++++
 lobsterstags.txt |   9 +++++++++
 minlobsters.py   |  22 ++++++++++++++++++++++
 minreddit.py     |  18 ++++++++++++++++++
 redditsubs.txt   |   5 +++++
 requirements.txt |   7 +++++++
 spider.py        | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 231 insertions(+), 0 deletions(-)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4453e4e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# barkingspider
+
+A bot that pulls down new content from Atom, RSS and JSON sources, filters it against per-source score thresholds and a URL blocklist, then spits out a bunch of links.
+
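+## Usage
+
+Install the pinned dependencies, then point the spider at one or both sources. A minimal invocation (`python spider.py --help` lists all options):
+
+```sh
+pip install -r requirements.txt
+python spider.py --reddit --lobsters
+```
+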
+Name from Roger's Profanisaurus:
+
+**Barking Spider**
+
+n. Ringpiece; anus; chocolate starfish (qv).
\ No newline at end of file
diff --git a/badurls.txt b/badurls.txt
new file mode 100644
index 0000000..d439c59
--- /dev/null
+++ b/badurls.txt
@@ -0,0 +1,6 @@
+twitter.com
+https://x.com
+https://www.reddit.com
+imgurl.com
+i.redd.it
+troyhunt.com
diff --git a/lobsterstags.txt b/lobsterstags.txt
new file mode 100644
index 0000000..ad81f9a
--- /dev/null
+++ b/lobsterstags.txt
@@ -0,0 +1,9 @@
+security:8
+virtualization:9
+reversing:5
+privacy:9
+openbsd:14
+cryptography:6
+osdev:6
+networking:13
+compsci:16
diff --git a/minlobsters.py b/minlobsters.py
new file mode 100644
index 0000000..e383535
--- /dev/null
+++ b/minlobsters.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+import requests
+
+from selectolax.parser import HTMLParser
+
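+# Minimal standalone scraper for a single lobste.rs tag page, using selectolax CSS selectors.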
+def extract_lobsters(tag):
+    base_url = f"https://lobste.rs/t/{tag}"
+    r = requests.get(base_url, headers={'User-agent': 'yourbot'}, timeout=30)
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')  # Our parent CSS class, each article is in an li
+    for i in items:
+        score = i.css("div.score")[0].text()  # .text() recovers the text between tags
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']  # .attrs recovers attribute content (i.e. href here)
+
+        print(f"\"{title}\" - {url} ({score})")
+
+if __name__ == "__main__":
+    extract_lobsters("reversing")
diff --git a/minreddit.py b/minreddit.py
new file mode 100644
index 0000000..73ddd41
--- /dev/null
+++ b/minreddit.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import requests
+
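+# Appending .json to a subreddit URL makes reddit return the listing as JSON.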
+def extract_reddit(sub):
+    base_url = f"https://www.reddit.com/r/{sub}.json"
+    r = requests.get(base_url, headers={'User-agent': 'yourbot'}, timeout=30)
+    data = r.json()['data']['children']
+
+    for i in data:
+        d = i['data']
+        title, url, score = d['title'], d['url'], d['score']
+
+        print(f"\"{title}\" - {url} ({score})")
+
+if __name__ == "__main__":
+    extract_reddit("blueteamsec")
diff --git a/redditsubs.txt b/redditsubs.txt
new file mode 100644
index 0000000..61b7109
--- /dev/null
+++ b/redditsubs.txt
@@ -0,0 +1,5 @@
+ReverseEngineering:1
+redteamsec:8
+blueteamsec:12
+netsec:30
+cybersecurity:100
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e64a534
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+certifi==2024.2.2
+charset-normalizer==3.3.2
+diskcache==5.6.3
+idna==3.7
+requests==2.31.0
+selectolax==0.3.21
+urllib3==2.2.1
diff --git a/spider.py b/spider.py
new file mode 100644
index 0000000..0474c41
--- /dev/null
+++ b/spider.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+
+import sys, getopt
+import requests
+
+from diskcache import Index
+from selectolax.parser import HTMLParser
+
+DEBUG = False
+USAGE = f"Usage: python {sys.argv[0]} [--help] [--version] [--reddit] [--lobsters]"
+VERSION = f"{sys.argv[0]} version 1.0.0"
+
+subs = {}
+tags = {}
+limit = 100
+timeframe = 'week'  # hour, day, week, month, year, all
+listing = 'top'  # controversial, best, hot, new, random, rising, top
+badurls = []
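+# diskcache Index: a persistent, dict-like store of every post seen so far,
+# keyed by URL, so repeated runs only report links that are new.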
+posts = Index('data/results')
+newposts = {}
+
+def parse():
+    config = {}
+    try:
+        options, arguments = getopt.getopt(
+            sys.argv[1:],                               # Arguments
+            'vrlh',                                     # Short option definitions
+            ["version", "reddit", "lobsters", "help"])  # Long option definitions
+    except getopt.GetoptError:
+        raise SystemExit(USAGE)
+    for o, a in options:
+        if o in ("-v", "--version"):
+            print(VERSION)
+            sys.exit()
+        if o in ("-r", "--reddit"):
+            config['reddit'] = True
+        if o in ("-l", "--lobsters"):
+            config['lobsters'] = True
+        if o in ("-h", "--help"):
+            print(USAGE)
+            sys.exit()
+    if not options or arguments:
+        # At least one option is required and no positional operands are expected.
+        raise SystemExit(USAGE)
+    return config
+
+
+def get_reddit(subreddit, listing, limit, timeframe):
+    base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}'
+    try:
+        request = requests.get(base_url, headers={'User-agent': 'yourbot'}, timeout=30)
+        request.raise_for_status()
+    except requests.RequestException as e:
+        raise SystemExit(f"An error occurred fetching r/{subreddit}: {e}")
+    return request.json()
+
+def process_reddit(sub, data, min_score):
+    for i in data:
+        d = i['data']
+        if d['score'] >= min_score:
+            title, url, score = d['title'], d['url'], d['score']
+
+            butest = [s for s in badurls if s.lower() in url.lower()]
+            if butest:
+                if DEBUG:
+                    print(f"{url} is in badurls {butest}")
+            else:
+                post = {'title': title, 'source': f"reddit:{sub}", 'url': url, 'score': score}
+                if url not in posts:
+                    newposts[url] = post
+
+    if newposts:
+        for k, p in newposts.items():
+            posts[p['url']] = p
+
+def extract_reddits(subs):
+    for sub, min_score in subs.items():
+        r = get_reddit(sub, listing, limit, timeframe)
+        data = r['data']['children']
+        process_reddit(sub, data, min_score)
+
+def extract_lobsters(tags):
+    taglist = ",".join(tags.keys())
+    base_url = f"https://lobste.rs/t/{taglist}"
+    try:
+        r = requests.get(base_url, headers={'User-agent': 'yourbot'}, timeout=30)
+        r.raise_for_status()
+    except requests.RequestException as e:
+        raise SystemExit(f"An error occurred extracting from lobste.rs: {e}")
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')
+    for i in items:
+        score = int(i.css("div.score")[0].text())
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']
+        story_tags = [x.text() for x in i.css("span.tags a")]
+        # Filter on the smallest min_score among the story's tags that we track,
+        # mirroring the per-sub score thresholds on the reddit side.
+        thresholds = [tags[t] for t in story_tags if t in tags]
+        if thresholds and score < min(thresholds):
+            continue
+        post = {'title': title, 'source': f"lobsters:{', '.join(story_tags)}", 'url': url, 'score': score}
+        if url not in posts:
+            newposts[url] = post
+
+    if newposts:
+        for k, p in newposts.items():
+            posts[p['url']] = p
+
+
+def dump_data():
+    if DEBUG:
+        print("Subs identified:")
+        for k, v in subs.items():
+            print(f"Sub: {k} - min_score: {v}")
+
+        print("\n")
+
+    for k, p in newposts.items():
+        print(f"Src: \"{p['source']}\" Title: \"{p['title']}\" Url: {p['url']} Score: {p['score']}")
+
+# main
+config = parse()
+
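+# badurls.txt holds one substring per line; any post whose URL contains one is dropped.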
+with open("badurls.txt") as f:
+    badurls = f.read().splitlines()
+
+if config.get("reddit"):
+    with open("redditsubs.txt") as f:
+        for l in f:
+            sub, min_score = l.partition(":")[::2]
+            subs[sub.strip()] = int(min_score)
+    extract_reddits(subs)
+
+if config.get("lobsters"):
+    with open("lobsterstags.txt") as f:
+        for l in f:
+            tag, min_score = l.partition(":")[::2]
+            tags[tag.strip()] = int(min_score)
+    extract_lobsters(tags)
+
+dump_data()