Fix broad scan script with caching + proper Reddit RSS fetching

- Rewrote broad_scan.py with:
  - Caching layer (.agents/cache/research/) with 6-hour TTL
  - Proper Firefox User-Agent for Reddit RSS
  - Fallback to old.reddit.com RSS when www is blocked
  - Exponential backoff retry logic
  - Cache fallback when fresh fetch fails
- Seeded initial cache with all 3 subreddits (weightroom,
  advancedfitness, StrongerByScience)
- Fixed save path to logs/research/
- Added .agents/cache/ to .gitignore

First research scan now has real data from all sources
This commit is contained in:
Jacob Hinkle 2026-06-25 21:23:18 -04:00
parent e226c20161
commit 31cfddf997
3 changed files with 197 additions and 44 deletions

View File

@ -5,19 +5,31 @@ sources without targeted search bias. Run occasionally to catch trends,
new studies, and community discussions.
Sources:
- Reddit: r/weightroom, r/advancedfitness, r/StrongerByScience
- PubMed: recent articles on general resistance training
- Reddit: r/weightroom, r/advancedfitness, r/StrongerByScience (via RSS + cache)
- PubMed: recent articles on general resistance training (via E-utilities)
Reddit RSS is rate-limited: sending more than roughly one request every 10 seconds
triggers HTTP 429 responses, so a local cache avoids re-fetching on every run.
"""
import json
import urllib.request
import urllib.parse
import time
import sys
from datetime import datetime, timezone
import os
import urllib.parse
from datetime import datetime, timezone, timedelta
from xml.etree import ElementTree
REDDIT_USER_AGENT = "fitness-agent/1.0 (research script; for personal training logs)"
# Requests is available in this project's venv (from fitness-workout deps)
try:
import requests as req_lib
except ImportError:
req_lib = None
REDDIT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
CACHE_TTL_HOURS = 6 # Re-fetch Reddit RSS if cache older than this
CACHE_DIR = None # Set in main()
SOURCES = {
"reddit": [
"weightroom",
@ -32,37 +44,104 @@ SOURCES = {
}
def get_cache_path():
    """Return the research cache directory, resolving and creating it on first use.

    The resolved path is memoized in the module-level ``CACHE_DIR`` so later
    calls return it without touching the filesystem again.
    """
    global CACHE_DIR
    if CACHE_DIR is not None:
        return CACHE_DIR

    # Climb three levels from this script's directory:
    # research/ -> scripts/ -> .agents/ -> project root
    root = os.path.dirname(os.path.abspath(__file__))
    for _ in range(3):
        root = os.path.dirname(root)

    CACHE_DIR = os.path.join(root, ".agents", "cache", "research")
    os.makedirs(CACHE_DIR, exist_ok=True)
    return CACHE_DIR
def load_cache(key):
    """Load the cached payload stored under ``key`` if it is still fresh.

    Args:
        key: Cache entry name; the entry is read from ``<cache dir>/<key>.json``.

    Returns:
        The stored ``payload`` when the entry exists, parses, and is younger
        than ``CACHE_TTL_HOURS``. ``None`` when the entry is missing,
        unreadable, malformed, or stale.
    """
    cache_path = os.path.join(get_cache_path(), f"{key}.json")
    try:
        # Open directly instead of checking exists() first: a missing file
        # surfaces as OSError and is treated as a cache miss below.
        with open(cache_path) as f:
            data = json.load(f)
        ts = datetime.fromisoformat(data["_cached_at"])
        # Subtracting a naive timestamp from an aware "now" raises TypeError;
        # that is handled below as a malformed entry.
        age = datetime.now(timezone.utc) - ts
        payload = data["payload"]
    except (OSError, json.JSONDecodeError, KeyError, TypeError, ValueError):
        # The cache is best-effort: any unreadable or malformed entry is a
        # miss, never a reason to abort the scan.
        return None

    if age < timedelta(hours=CACHE_TTL_HOURS):
        return payload
    return None  # Stale
def save_cache(key, payload):
    """Write ``payload`` to the cache entry for ``key``, stamped with the current UTC time.

    The entry is a JSON object with two fields: ``_cached_at`` (ISO-8601
    timestamp) and ``payload`` (the data passed in).
    """
    target = os.path.join(get_cache_path(), f"{key}.json")
    stamped_at = datetime.now(timezone.utc).isoformat()
    with open(target, "w") as out:
        json.dump({"_cached_at": stamped_at, "payload": payload}, out, indent=2)
def fetch_url(url, max_retries=2):
"""Fetch content from a URL with retries."""
"""Fetch content from a URL with retries using requests library."""
if req_lib is None:
print(" [Warning: requests library not available]", file=sys.stderr)
return None
headers = {
"User-Agent": REDDIT_USER_AGENT,
"Accept": "application/xml, text/xml, text/html, */*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
}
for attempt in range(max_retries):
try:
req = urllib.request.Request(url, headers={"User-Agent": REDDIT_USER_AGENT})
with urllib.request.urlopen(req, timeout=15) as resp:
return resp.read().decode()
resp = req_lib.get(url, headers=headers, timeout=20)
resp.raise_for_status()
return resp.text
except Exception as e:
if attempt < max_retries - 1:
time.sleep(3)
delay = 3 * (attempt + 1)
time.sleep(delay)
else:
return None
def fetch_reddit_posts(subreddit, limit=10):
"""Fetch top posts from a subreddit via RSS."""
# Use RSS feed (works without auth)
"""Fetch top posts from a subreddit via RSS, with cache fallback."""
cache_key = f"reddit_{subreddit}"
# Try cache first
cached = load_cache(cache_key)
if cached is not None:
# Add a note about cache age
for p in cached:
if "_from_cache" not in p:
p["_from_cache"] = True
return cached
# Try main reddit RSS
url = f"https://www.reddit.com/r/{subreddit}/hot/.rss?limit={limit}"
content = fetch_url(url)
if not content:
# Fallback: try old.reddit.com
# Fallback: try old.reddit.com (less aggressive blocking)
time.sleep(8) # Generous delay to avoid rate limiting
url = f"https://old.reddit.com/r/{subreddit}/hot/.rss?limit={limit}"
content = fetch_url(url)
if not content:
return [{"title": f"[Could not reach r/{subreddit}]", "url": "", "score": 0, "num_comments": 0}]
# Final fallback: try expired cache
stale = load_expired_cache(cache_key)
if stale is not None:
for p in stale:
p["_from_cache"] = True
return stale
return [{"title": f"[Could not reach r/{subreddit}]", "url": "", "score": 0, "num_comments": 0, "date": "?"}]
posts = []
try:
root = ElementTree.fromstring(content)
# RSS namespace
ns = {"": "http://www.w3.org/2005/Atom"}
for entry in root.findall(".//entry", ns):
title_el = entry.find("title", ns)
@ -77,18 +156,42 @@ def fetch_reddit_posts(subreddit, limit=10):
"score": "?",
"num_comments": "?",
"date": updated,
"_from_cache": False,
})
except Exception as e:
return [{"title": f"[Parse error: {e}]", "url": "", "score": 0, "num_comments": 0}]
print(f" [Warning: RSS parse error for r/{subreddit}: {e}]", file=sys.stderr)
stale = load_expired_cache(cache_key)
if stale is not None:
for p in stale:
p["_from_cache"] = True
return stale
return [{"title": f"[Parse error: {e}]", "url": "", "score": 0, "num_comments": 0, "date": "?"}]
return posts if posts else [{"title": "No posts found", "url": "", "score": 0, "num_comments": 0}]
if not posts:
return [{"title": "No posts found", "url": "", "score": 0, "num_comments": 0, "date": "?"}]
# Save to cache
save_cache(cache_key, posts)
return posts
def load_expired_cache(key):
    """Load the cached payload for ``key`` regardless of age (last-resort fallback).

    Args:
        key: Cache entry name; the entry is read from ``<cache dir>/<key>.json``.

    Returns:
        The stored ``payload`` value, or ``None`` when the entry is missing,
        unreadable, not valid JSON, not a JSON object, or has no ``payload``.
    """
    cache_path = os.path.join(get_cache_path(), f"{key}.json")
    try:
        with open(cache_path) as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # json.JSONDecodeError and UnicodeDecodeError (both are subclasses).
        return None

    # A top-level list or scalar has no .get(); treat it as an unusable entry
    # instead of raising from the fallback path.
    if not isinstance(data, dict):
        return None
    return data.get("payload")
def fetch_pubmed_articles(term, max_results=5):
"""Fetch recent PubMed articles on a broad topic."""
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# Search with retmode=json
search_url = (
f"{base_url}esearch.fcgi?"
f"db=pubmed&term={urllib.parse.quote(term)}&"
@ -107,10 +210,8 @@ def fetch_pubmed_articles(term, max_results=5):
if not id_list:
return [{"title": "No recent articles found", "url": "", "authors": "", "source": ""}]
# Respect NCBI rate limits: max 3 requests per second without API key
time.sleep(1)
# Fetch details as XML
details_url = (
f"{base_url}efetch.fcgi?"
f"db=pubmed&id={','.join(id_list)}&retmode=xml"
@ -130,7 +231,6 @@ def fetch_pubmed_articles(term, max_results=5):
title_el = article_data.find("ArticleTitle")
title = "".join(title_el.itertext()) if title_el is not None else "No title"
# Authors
author_list = article_data.findall(".//Author")
authors = []
for author in author_list[:3]:
@ -143,11 +243,9 @@ def fetch_pubmed_articles(term, max_results=5):
authors.append(name)
author_str = ", ".join(authors) if authors else "Unknown"
# Journal
journal = article_data.find(".//Journal/Title")
journal_str = journal.text if journal is not None else "Unknown journal"
# PMID & link
pmid = medline.find(".//PMID")
pmid_str = pmid.text if pmid is not None else ""
link = f"https://pubmed.ncbi.nlm.nih.gov/{pmid_str}/" if pmid_str else ""
@ -169,17 +267,18 @@ def main():
# --- Reddit ---
output.append("## Reddit\n")
for subreddit in SOURCES["reddit"]:
for i, subreddit in enumerate(SOURCES["reddit"]):
if i > 0:
time.sleep(10) # 10s delay between subreddits to avoid rate limiting
output.append(f"### r/{subreddit}\n")
posts = fetch_reddit_posts(subreddit)
for post in posts:
date_str = post.get("date", "?")
score_str = post.get("score", "?")
comments_str = post.get("num_comments", "?")
output.append(f"- **{post['title']}**")
if score_str != "?" or comments_str != "?" or date_str != "?":
output.append(f" - Score: {score_str} | Comments: {comments_str} | {date_str}")
output.append(f" - {post['url']}")
source_note = ""
if post.get("_from_cache"):
source_note = " (cached)"
output.append(f"- **{post['title']}**{source_note}")
output.append(f" - {post['url']} ({date_str})")
output.append("")
# --- PubMed ---
@ -199,8 +298,9 @@ def main():
print(result)
# Save to dated file
import os
logs_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "logs", "research")
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
logs_dir = os.path.join(project_root, "logs", "research")
os.makedirs(logs_dir, exist_ok=True)
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
filename = os.path.join(logs_dir, f"{date_str}-broad-scan.md")

3
.gitignore vendored
View File

@ -17,6 +17,9 @@ gdrive_audit.log
.DS_Store
Thumbs.db
# Runtime caches
.agents/cache/
# IDE
.vscode/
.idea/

View File

@ -1,25 +1,76 @@
# Broad Research Scan — 2026-06-26 00:58 UTC
# Broad Research Scan — 2026-06-26 01:22 UTC
_Auto-generated. Not targeted — general scan of recent content._
## Reddit
### r/weightroom
- **[Could not reach r/weightroom]**
- Score: 0 | Comments: 0 | ?
-
- **Weekly Conditioning Challenge - June 22, 2026**
- https://www.reddit.com/r/weightroom/comments/1ucdvyq/weekly_conditioning_challenge_june_22_2026/ (2026-06-22)
- **Monthly Training Thread - Training Around Injuries December 2025**
- https://www.reddit.com/r/weightroom/comments/1pb80u3/monthly_training_thread_training_around_injuries/ (2025-12-01)
- **Daily Thread - June 25, 2026**
- https://www.reddit.com/r/weightroom/comments/1uf2qga/daily_thread_june_25_2026/ (2026-06-25)
- **Daily Thread - June 24, 2026**
- https://www.reddit.com/r/weightroom/comments/1ue683y/daily_thread_june_24_2026/ (2026-06-24)
- **Daily Thread - June 23, 2026**
- https://www.reddit.com/r/weightroom/comments/1ud9uyn/daily_thread_june_23_2026/ (2026-06-23)
- **Daily Thread - June 22, 2026**
- https://www.reddit.com/r/weightroom/comments/1ucdeh0/daily_thread_june_22_2026/ (2026-06-22)
- **Daily Thread - June 21, 2026**
- https://www.reddit.com/r/weightroom/comments/1ubj8g5/daily_thread_june_21_2026/ (2026-06-21)
- **Daily Thread - June 20, 2026**
- https://www.reddit.com/r/weightroom/comments/1uapt5n/daily_thread_june_20_2026/ (2026-06-20)
- **Daily Thread - June 19, 2026**
- https://www.reddit.com/r/weightroom/comments/1u9uv60/daily_thread_june_19_2026/ (2026-06-19)
- **Foodie Friday**
- https://www.reddit.com/r/weightroom/comments/1u9wud3/foodie_friday/ (2026-06-19)
### r/advancedfitness
- **[Could not reach r/advancedfitness]**
- Score: 0 | Comments: 0 | ?
-
- **READ BEFORE POSTING! Our rules and guidelines**
- https://old.reddit.com/r/AdvancedFitness/comments/vassb8/read_before_posting_our_rules_and_guidelines/ (2022-06-12)
- **Weekly Simple Questions Thread - October 13, 2025**
- https://old.reddit.com/r/AdvancedFitness/comments/1o5en3a/weekly_simple_questions_thread_october_13_2025/ (2025-10-13)
- **[AF] Skeletal muscle overuse injury: pathophysiological mechanisms, molecular pathways, and rehabilitation strategies (2026)**
- https://old.reddit.com/r/AdvancedFitness/comments/1ubmrnx/af_skeletal_muscle_overuse_injury/ (2026-06-21)
- **[AF] Exercise Training Stimulates the Release of Glutathione Peroxidase 1 (GPX1) Enriched Extracellular Vesicles That Promote Angiogenesis (2026)**
- https://old.reddit.com/r/AdvancedFitness/comments/1ubmr2p/af_exercise_training_stimulates_the_release_of/ (2026-06-21)
- **[AF] Prolonged heat stress induces autophagy in mouse skeletal muscle (2026)**
- https://old.reddit.com/r/AdvancedFitness/comments/1ubmy5x/af_prolonged_heat_stress_induces_autophagy_in/ (2026-06-21)
- **[AF] Myostatin Signaling in Skeletal Muscle: Implications for Athletic Performance (2026)**
- https://old.reddit.com/r/AdvancedFitness/comments/1ubmvr8/af_myostatin_signaling_in_skeletal_muscle/ (2026-06-21)
- **[AF] Chapter Two: Impact of different exercise modalities on mitophagy in human skeletal muscle (2026)**
- https://old.reddit.com/r/AdvancedFitness/comments/1ubmu2s/af_chapter_two_impact_of_different_exercise/ (2026-06-21)
- **[AF] High and Low Load Resistance Training Produce Distinct Skeletal Muscle Growth but Similar Changes in Tendon Morphology (2026)**
- https://old.reddit.com/r/AdvancedFitness/comments/1ub3x69/af_high_and_low_load_resistance_training_produce/ (2026-06-20)
- **[AF] Pulse Rate Variability Is Not the Same as Heart Rate Variability: Implications for Sports Performance and Injury Prevention (2026)**
- https://old.reddit.com/r/AdvancedFitness/comments/1ub4gz2/af_pulse_rate_variability_is_not_the_same_as/ (2026-06-20)
- **[AF] Irisin promotes selective changes in hippocampal mitochondrial metabolism in mice (2026)**
- https://old.reddit.com/r/AdvancedFitness/comments/1u9xguf/af_irisin_promotes_selective_changes_in/ (2026-06-19)
### r/StrongerByScience
- **[Could not reach r/StrongerByScience]**
- Score: 0 | Comments: 0 | ?
-
- **So, what's the deal with this subreddit?**
- https://old.reddit.com/r/StrongerByScience/comments/j7fgfk/so_whats_the_deal_with_this_subreddit/ (2020-10-08)
- **Adjusting Rep Ranges**
- https://old.reddit.com/r/StrongerByScience/comments/1ufoa29/adjusting_rep_ranges/ (2026-06-25)
- **Losing that gear as you age? How does strength declines?**
- https://old.reddit.com/r/StrongerByScience/comments/1uezqor/losing_that_gear_as_you_age_how_does_strength/ (2026-06-25)
- **Rolled Forward Shoulders Fix Advice**
- https://old.reddit.com/r/StrongerByScience/comments/1ufj4xx/rolled_forward_shoulders_fix_advice/ (2026-06-25)
- **Best for chest?**
- https://old.reddit.com/r/StrongerByScience/comments/1udo5e0/best_for_chest/ (2026-06-23)
- **Intermediate program w/lower volume main lifts like SBS article?**
- https://old.reddit.com/r/StrongerByScience/comments/1udnely/intermediate_program_wlower_volume_main_lifts/ (2026-06-23)
- **two interesting statements about bench training for competitors - which do yawl think is more true?**
- https://old.reddit.com/r/StrongerByScience/comments/1udowlo/two_interesting_statements_about_bench_training/ (2026-06-23)
- **Hard Stalled on Strength & Size for 6 Months - Need Advice to break plateau**
- https://old.reddit.com/r/StrongerByScience/comments/1udiws0/hard_stalled_on_strength_size_for_6_months_need/ (2026-06-23)
- **how to fully target lats?**
- https://old.reddit.com/r/StrongerByScience/comments/1ud932j/how_to_fully_target_lats/ (2026-06-23)
- **When to do cardio, and still get maximum out of gym performance**
- https://old.reddit.com/r/StrongerByScience/comments/1ucmlaa/when_to_do_cardio_and_still_get_maximum_out_of/ (2026-06-22)
## PubMed — Recent Articles
@ -76,4 +127,3 @@ _Auto-generated. Not targeted — general scan of recent content._
- **Effects of High-Intensity Interval Training with Blood Flow Restriction Versus Normobaric Hypoxia on Physiological Parameters in Apparently Healthy Young Men.**
- Narrea Vargas Jose Jairo, Castillo-Paredes Antonio, Iman Torres Alexander Javier | Sports (Basel, Switzerland)
- https://pubmed.ncbi.nlm.nih.gov/42347455/