 README.md        |  18 ++++++++++++++++++
 badurls.txt      |   6 ++++++
 lobsterstags.txt |   9 +++++++++
 minlobsters.py   |  22 ++++++++++++++++++++++
 minreddit.py     |  18 ++++++++++++++++++
 redditsubs.txt   |   5 +++++
 requirements.txt |   7 +++++++
 spider.py        | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 231 insertions(+), 0 deletions(-)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4453e4e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# barkingspider
+
+A bot that pulls down new content from Atom, RSS and JSON sources, filters it against per-source score thresholds and a URL blocklist, then spits out a bunch of links.
+
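+## Usage
+
+Install the pinned dependencies, then point the spider at one or both sources. A minimal invocation (`python spider.py --help` lists all options):
+
+```sh
+pip install -r requirements.txt
+python spider.py --reddit --lobsters
+```
+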
+Name from Roger's Profanisaurus:
+
+**Barking Spider**
+
+n. Ringpiece; anus; chocolate starfish (qv).
\ No newline at end of file
diff --git a/badurls.txt b/badurls.txt
new file mode 100644
index 0000000..d439c59
--- /dev/null
+++ b/badurls.txt
@@ -0,0 +1,6 @@
+twitter.com
+https://x.com
+https://www.reddit.com
+imgurl.com
+i.redd.it
+troyhunt.com
diff --git a/lobsterstags.txt b/lobsterstags.txt
new file mode 100644
index 0000000..ad81f9a
--- /dev/null
+++ b/lobsterstags.txt
@@ -0,0 +1,9 @@
+security:8
+virtualization:9
+reversing:5
+privacy:9
+openbsd:14
+cryptography:6
+osdev:6
+networking:13
+compsci:16
diff --git a/minlobsters.py b/minlobsters.py
new file mode 100644
index 0000000..e383535
--- /dev/null
+++ b/minlobsters.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+import requests
+
+from selectolax.parser import HTMLParser
+
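+# Minimal standalone scraper for a single lobste.rs tag page, using selectolax CSS selectors.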
+def extract_lobsters(tag):
+    base_url = f"https://lobste.rs/t/{tag}"
+    r = requests.get(base_url, headers={'User-agent': 'yourbot'}, timeout=30)
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')  # Our parent CSS class, each article is in an li
+    for i in items:
+        score = i.css("div.score")[0].text()  # .text() recovers the text between tags
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']  # .attrs recovers attribute content (i.e. href here)
+
+        print(f"\"{title}\" - {url} ({score})")
+
+if __name__ == "__main__":
+    extract_lobsters("reversing")
diff --git a/minreddit.py b/minreddit.py
new file mode 100644
index 0000000..73ddd41
--- /dev/null
+++ b/minreddit.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import requests
+
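+# Appending .json to a subreddit URL makes reddit return the listing as JSON.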
+def extract_reddit(sub):
+    base_url = f"https://www.reddit.com/r/{sub}.json"
+    r = requests.get(base_url, headers={'User-agent': 'yourbot'}, timeout=30)
+    data = r.json()['data']['children']
+
+    for i in data:
+        d = i['data']
+        title, url, score = d['title'], d['url'], d['score']
+
+        print(f"\"{title}\" - {url} ({score})")
+
+if __name__ == "__main__":
+    extract_reddit("blueteamsec")
diff --git a/redditsubs.txt b/redditsubs.txt
new file mode 100644
index 0000000..61b7109
--- /dev/null
+++ b/redditsubs.txt
@@ -0,0 +1,5 @@
+ReverseEngineering:1
+redteamsec:8
+blueteamsec:12
+netsec:30
+cybersecurity:100
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e64a534
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+certifi==2024.2.2
+charset-normalizer==3.3.2
+diskcache==5.6.3
+idna==3.7
+requests==2.31.0
+selectolax==0.3.21
+urllib3==2.2.1
diff --git a/spider.py b/spider.py
new file mode 100644
index 0000000..0474c41
--- /dev/null
+++ b/spider.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+
+import sys, getopt
+import requests
+
+from diskcache import Index
+from selectolax.parser import HTMLParser
+
+DEBUG = False
+USAGE = f"Usage: python {sys.argv[0]} [--help] [--version] [--reddit] [--lobsters]"
+VERSION = f"{sys.argv[0]} version 1.0.0"
+
+subs = {}
+tags = {}
+limit = 100
+timeframe = 'week'  # hour, day, week, month, year, all
+listing = 'top'  # controversial, best, hot, new, random, rising, top
+badurls = []
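+# diskcache Index: a persistent, dict-like store of every post seen so far,
+# keyed by URL, so repeated runs only report links that are new.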
+posts = Index('data/results')
+newposts = {}
+
+def parse():
+    config = {}
+    try:
+        options, arguments = getopt.getopt(
+            sys.argv[1:],                               # Arguments
+            'vrlh',                                     # Short option definitions
+            ["version", "reddit", "lobsters", "help"])  # Long option definitions
+    except getopt.GetoptError:
+        raise SystemExit(USAGE)
+    for o, a in options:
+        if o in ("-v", "--version"):
+            print(VERSION)
+            sys.exit()
+        if o in ("-r", "--reddit"):
+            config['reddit'] = True
+        if o in ("-l", "--lobsters"):
+            config['lobsters'] = True
+        if o in ("-h", "--help"):
+            print(USAGE)
+            sys.exit()
+    if not options or arguments:
+        # At least one option is required and no positional operands are expected.
+        raise SystemExit(USAGE)
+    return config
+
+
+def get_reddit(subreddit, listing, limit, timeframe):
+    base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}'
+    try:
+        request = requests.get(base_url, headers={'User-agent': 'yourbot'}, timeout=30)
+        request.raise_for_status()
+    except requests.RequestException as e:
+        raise SystemExit(f"An error occurred fetching r/{subreddit}: {e}")
+    return request.json()
+
+def process_reddit(sub, data, min_score):
+    for i in data:
+        d = i['data']
+        if d['score'] >= min_score:
+            title, url, score = d['title'], d['url'], d['score']
+
+            butest = [s for s in badurls if s.lower() in url.lower()]
+            if butest:
+                if DEBUG:
+                    print(f"{url} is in badurls {butest}")
+            else:
+                post = {'title': title, 'source': f"reddit:{sub}", 'url': url, 'score': score}
+                if url not in posts:
+                    newposts[url] = post
+
+    if newposts:
+        for k, p in newposts.items():
+            posts[p['url']] = p
+
+def extract_reddits(subs):
+    for sub, min_score in subs.items():
+        r = get_reddit(sub, listing, limit, timeframe)
+        data = r['data']['children']
+        process_reddit(sub, data, min_score)
+
+def extract_lobsters(tags):
+    taglist = ",".join(tags.keys())
+    base_url = f"https://lobste.rs/t/{taglist}"
+    try:
+        r = requests.get(base_url, headers={'User-agent': 'yourbot'}, timeout=30)
+        r.raise_for_status()
+    except requests.RequestException as e:
+        raise SystemExit(f"An error occurred extracting from lobste.rs: {e}")
+
+    h = HTMLParser(r.text)
+    items = h.css('ol.stories.list li')
+    for i in items:
+        score = int(i.css("div.score")[0].text())
+        title = i.css("span.link a")[0].text()
+        url = i.css("span.link a")[0].attrs['href']
+        story_tags = [x.text() for x in i.css("span.tags a")]
+        # Filter on the smallest min_score among the story's tags that we track,
+        # mirroring the per-sub score thresholds on the reddit side.
+        thresholds = [tags[t] for t in story_tags if t in tags]
+        if thresholds and score < min(thresholds):
+            continue
+        post = {'title': title, 'source': f"lobsters:{', '.join(story_tags)}", 'url': url, 'score': score}
+        if url not in posts:
+            newposts[url] = post
+
+    if newposts:
+        for k, p in newposts.items():
+            posts[p['url']] = p
+
+
+def dump_data():
+    if DEBUG:
+        print("Subs identified:")
+        for k, v in subs.items():
+            print(f"Sub: {k} - min_score: {v}")
+
+        print("\n")
+
+    for k, p in newposts.items():
+        print(f"Src: \"{p['source']}\" Title: \"{p['title']}\" Url: {p['url']} Score: {p['score']}")
+
+# main
+config = parse()
+
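+# badurls.txt holds one substring per line; any post whose URL contains one is dropped.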
+with open("badurls.txt") as f:
+    badurls = f.read().splitlines()
+
+if config.get("reddit"):
+    with open("redditsubs.txt") as f:
+        for l in f:
+            sub, min_score = l.partition(":")[::2]
+            subs[sub.strip()] = int(min_score)
+    extract_reddits(subs)
+
+if config.get("lobsters"):
+    with open("lobsterstags.txt") as f:
+        for l in f:
+            tag, min_score = l.partition(":")[::2]
+            tags[tag.strip()] = int(min_score)
+    extract_lobsters(tags)
+
+dump_data()