Fix broad scan script with caching + proper Reddit RSS fetching

Rewrote broad_scan.py with:

- Caching layer (.agents/cache/research/) with a 6-hour TTL
- Proper Firefox User-Agent for Reddit RSS
- Fallback to old.reddit.com RSS when www is blocked
- Retry logic with an increasing backoff delay
- Cache fallback when a fresh fetch fails
- Seeded the initial cache with all 3 subreddits (weightroom, advancedfitness, StrongerByScience)
- Fixed the save path to logs/research/
- Added .agents/cache/ to .gitignore

The first research scan now has real data from all sources.
2026-06-25 21:23:18 -04:00 · 2026-06-25 21:23:18 -04:00 · 31cfddf997
commit 31cfddf997
parent e226c20161
3 changed files with 197 additions and 44 deletions
--- a/.agents/scripts/research/broad_scan.py
+++ b/.agents/scripts/research/broad_scan.py
@ -5,19 +5,31 @@ sources without targeted search bias. Run occasionally to catch trends,
 new studies, and community discussions.

 Sources:
-  - Reddit: r/weightroom, r/advancedfitness, r/StrongerByScience
-  - PubMed: recent articles on general resistance training
+  - Reddit: r/weightroom, r/advancedfitness, r/StrongerByScience (via RSS + cache)
+  - PubMed: recent articles on general resistance training (via E-utilities)
+
+Reddit RSS is rate-limited: roughly one request per 10 s is tolerated before
+HTTP 429 responses start. A local cache avoids re-fetching on every run.
 """

 import json
-import urllib.request
-import urllib.parse
 import time
 import sys
-from datetime import datetime, timezone
+import os
+import urllib.parse
+from datetime import datetime, timezone, timedelta
 from xml.etree import ElementTree

-REDDIT_USER_AGENT = "fitness-agent/1.0 (research script; for personal training logs)"
+# Requests is available in this project's venv (from fitness-workout deps)
+try:
+    import requests as req_lib
+except ImportError:
+    req_lib = None
+
+REDDIT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
+CACHE_TTL_HOURS = 6  # Re-fetch Reddit RSS if cache older than this
+CACHE_DIR = None      # Resolved lazily by get_cache_path() on first use
+
 SOURCES = {
    "reddit": [
        "weightroom",
@ -32,37 +44,104 @@ SOURCES = {
 }


+def get_cache_path():
+    """Return the research cache directory, creating it on first use.
+
+    The location is derived from this script's own path and remembered in the
+    module-level ``CACHE_DIR`` so later calls skip the filesystem work.
+    """
+    global CACHE_DIR
+    if CACHE_DIR is not None:
+        return CACHE_DIR
+
+    # Start at this script's directory and climb three levels:
+    # research/ -> scripts/ -> .agents/ -> project root.
+    root = os.path.dirname(os.path.abspath(__file__))
+    for _ in range(3):
+        root = os.path.dirname(root)
+
+    CACHE_DIR = os.path.join(root, ".agents", "cache", "research")
+    os.makedirs(CACHE_DIR, exist_ok=True)
+    return CACHE_DIR
+
+
+def load_cache(key):
+    """Load the cached payload for ``key`` if it is still fresh.
+
+    Args:
+        key: Cache entry name; the entry lives at ``<cache dir>/<key>.json``.
+
+    Returns:
+        The stored ``payload`` when the entry is younger than
+        ``CACHE_TTL_HOURS``; ``None`` when the entry is missing, unreadable,
+        malformed, or stale.
+    """
+    cache_path = os.path.join(get_cache_path(), f"{key}.json")
+    try:
+        with open(cache_path, encoding="utf-8") as f:
+            data = json.load(f)
+        ts = datetime.fromisoformat(data["_cached_at"])
+        if ts.tzinfo is None:
+            # A naive timestamp cannot be subtracted from an aware "now";
+            # save_cache() writes UTC, so interpret it as UTC.
+            ts = ts.replace(tzinfo=timezone.utc)
+        age = datetime.now(timezone.utc) - ts
+        payload = data["payload"]
+    except (OSError, KeyError, TypeError, ValueError):
+        # OSError: missing or unreadable file (also covers a file removed
+        # between runs). ValueError: invalid JSON (JSONDecodeError subclasses
+        # it) or an unparsable timestamp. KeyError/TypeError: the JSON does
+        # not have the expected {"_cached_at", "payload"} shape.
+        return None
+    return payload if age < timedelta(hours=CACHE_TTL_HOURS) else None
+
+
+def save_cache(key, payload):
+    """Write ``payload`` to the cache entry for ``key`` with a UTC timestamp.
+
+    The entry is written to a sibling temporary file and then renamed over the
+    final path, so a failed or interrupted write never truncates an existing
+    entry that load_expired_cache() may still need as a fallback.
+
+    Args:
+        key: Cache entry name; stored as ``<cache dir>/<key>.json``.
+        payload: JSON-serialisable data to store.
+
+    Raises:
+        TypeError: If ``payload`` is not JSON-serialisable.
+        OSError: If the cache file cannot be written.
+    """
+    cache_path = os.path.join(get_cache_path(), f"{key}.json")
+    tmp_path = f"{cache_path}.tmp"
+    data = {
+        "_cached_at": datetime.now(timezone.utc).isoformat(),
+        "payload": payload,
+    }
+    try:
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        # Swap the complete file into place in a single rename.
+        os.replace(tmp_path, cache_path)
+    finally:
+        # After a successful replace the temp file is gone; after a failure
+        # this removes the partial file.
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+
+
 def fetch_url(url, max_retries=2):
-    """Fetch content from a URL with retries."""
+    """Fetch content from a URL with retries using requests library."""
+    if req_lib is None:
+        print("    [Warning: requests library not available]", file=sys.stderr)
+        return None
+    headers = {
+        "User-Agent": REDDIT_USER_AGENT,
+        "Accept": "application/xml, text/xml, text/html, */*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+    }
    for attempt in range(max_retries):
        try:
-            req = urllib.request.Request(url, headers={"User-Agent": REDDIT_USER_AGENT})
-            with urllib.request.urlopen(req, timeout=15) as resp:
-                return resp.read().decode()
+            resp = req_lib.get(url, headers=headers, timeout=20)
+            resp.raise_for_status()
+            return resp.text
        except Exception as e:
            if attempt < max_retries - 1:
-                time.sleep(3)
+                delay = 3 * (attempt + 1)
+                time.sleep(delay)
            else:
                return None


 def fetch_reddit_posts(subreddit, limit=10):
-    """Fetch top posts from a subreddit via RSS."""
-    # Use RSS feed (works without auth)
+    """Fetch top posts from a subreddit via RSS, with cache fallback."""
+    cache_key = f"reddit_{subreddit}"
+
+    # Try cache first
+    cached = load_cache(cache_key)
+    if cached is not None:
+        # Add a note about cache age
+        for p in cached:
+            if "_from_cache" not in p:
+                p["_from_cache"] = True
+        return cached
+
+    # Try main reddit RSS
    url = f"https://www.reddit.com/r/{subreddit}/hot/.rss?limit={limit}"
    content = fetch_url(url)
    if not content:
-        # Fallback: try old.reddit.com
+        # Fallback: try old.reddit.com (less aggressive blocking)
+        time.sleep(8)  # Generous delay to avoid rate limiting
        url = f"https://old.reddit.com/r/{subreddit}/hot/.rss?limit={limit}"
        content = fetch_url(url)

    if not content:
-        return [{"title": f"[Could not reach r/{subreddit}]", "url": "", "score": 0, "num_comments": 0}]
+        # Final fallback: try expired cache
+        stale = load_expired_cache(cache_key)
+        if stale is not None:
+            for p in stale:
+                p["_from_cache"] = True
+            return stale
+        return [{"title": f"[Could not reach r/{subreddit}]", "url": "", "score": 0, "num_comments": 0, "date": "?"}]

    posts = []
    try:
        root = ElementTree.fromstring(content)
-        # RSS namespace
        ns = {"": "http://www.w3.org/2005/Atom"}
        for entry in root.findall(".//entry", ns):
            title_el = entry.find("title", ns)
@ -77,18 +156,42 @@ def fetch_reddit_posts(subreddit, limit=10):
                "score": "?",
                "num_comments": "?",
                "date": updated,
+                "_from_cache": False,
            })
    except Exception as e:
-        return [{"title": f"[Parse error: {e}]", "url": "", "score": 0, "num_comments": 0}]
+        print(f"    [Warning: RSS parse error for r/{subreddit}: {e}]", file=sys.stderr)
+        stale = load_expired_cache(cache_key)
+        if stale is not None:
+            for p in stale:
+                p["_from_cache"] = True
+            return stale
+        return [{"title": f"[Parse error: {e}]", "url": "", "score": 0, "num_comments": 0, "date": "?"}]

-    return posts if posts else [{"title": "No posts found", "url": "", "score": 0, "num_comments": 0}]
+    if not posts:
+        return [{"title": "No posts found", "url": "", "score": 0, "num_comments": 0, "date": "?"}]
+
+    # Save to cache
+    save_cache(cache_key, posts)
+    return posts
+
+
+def load_expired_cache(key):
+    """Load the cached payload for ``key`` regardless of its age.
+
+    Used as a last resort when a fresh fetch fails.
+
+    Args:
+        key: Cache entry name; the entry lives at ``<cache dir>/<key>.json``.
+
+    Returns:
+        The stored ``payload``, or ``None`` when the entry is missing,
+        unreadable, not valid JSON, or not a JSON object.
+    """
+    cache_path = os.path.join(get_cache_path(), f"{key}.json")
+    try:
+        with open(cache_path, encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError):
+        # OSError: missing or unreadable file. ValueError: invalid JSON
+        # (JSONDecodeError subclasses it).
+        return None
+    if not isinstance(data, dict):
+        # Only a JSON object can carry a "payload" entry; .get() on anything
+        # else would raise AttributeError.
+        return None
+    return data.get("payload")


 def fetch_pubmed_articles(term, max_results=5):
    """Fetch recent PubMed articles on a broad topic."""
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

-    # Search with retmode=json
    search_url = (
        f"{base_url}esearch.fcgi?"
        f"db=pubmed&term={urllib.parse.quote(term)}&"
@ -107,10 +210,8 @@ def fetch_pubmed_articles(term, max_results=5):
    if not id_list:
        return [{"title": "No recent articles found", "url": "", "authors": "", "source": ""}]

-    # Respect NCBI rate limits: max 3 requests per second without API key
    time.sleep(1)

-    # Fetch details as XML
    details_url = (
        f"{base_url}efetch.fcgi?"
        f"db=pubmed&id={','.join(id_list)}&retmode=xml"
@ -130,7 +231,6 @@ def fetch_pubmed_articles(term, max_results=5):
        title_el = article_data.find("ArticleTitle")
        title = "".join(title_el.itertext()) if title_el is not None else "No title"

-        # Authors
        author_list = article_data.findall(".//Author")
        authors = []
        for author in author_list[:3]:
@ -143,11 +243,9 @@ def fetch_pubmed_articles(term, max_results=5):
                authors.append(name)
        author_str = ", ".join(authors) if authors else "Unknown"

-        # Journal
        journal = article_data.find(".//Journal/Title")
        journal_str = journal.text if journal is not None else "Unknown journal"

-        # PMID & link
        pmid = medline.find(".//PMID")
        pmid_str = pmid.text if pmid is not None else ""
        link = f"https://pubmed.ncbi.nlm.nih.gov/{pmid_str}/" if pmid_str else ""
@ -169,17 +267,18 @@ def main():

    # --- Reddit ---
    output.append("## Reddit\n")
-    for subreddit in SOURCES["reddit"]:
+    for i, subreddit in enumerate(SOURCES["reddit"]):
+        if i > 0:
+            time.sleep(10)  # 10s delay between subreddits to avoid rate limiting
        output.append(f"### r/{subreddit}\n")
        posts = fetch_reddit_posts(subreddit)
        for post in posts:
            date_str = post.get("date", "?")
-            score_str = post.get("score", "?")
-            comments_str = post.get("num_comments", "?")
-            output.append(f"- **{post['title']}**")
-            if score_str != "?" or comments_str != "?" or date_str != "?":
-                output.append(f"  - Score: {score_str} | Comments: {comments_str} | {date_str}")
-            output.append(f"  - {post['url']}")
+            source_note = ""
+            if post.get("_from_cache"):
+                source_note = " (cached)"
+            output.append(f"- **{post['title']}**{source_note}")
+            output.append(f"  - {post['url']} ({date_str})")
        output.append("")

    # --- PubMed ---
@ -199,8 +298,9 @@ def main():
    print(result)

    # Save to dated file
-    import os
-    logs_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "logs", "research")
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
+    logs_dir = os.path.join(project_root, "logs", "research")
    os.makedirs(logs_dir, exist_ok=True)
    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    filename = os.path.join(logs_dir, f"{date_str}-broad-scan.md")
--- a/.gitignore
+++ b/.gitignore
@ -17,6 +17,9 @@ gdrive_audit.log
 .DS_Store
 Thumbs.db

+# Runtime caches
+.agents/cache/
+
 # IDE
 .vscode/
 .idea/
--- a/logs/research/2026-06-26-broad-scan.md
+++ b/logs/research/2026-06-26-broad-scan.md
@ -1,25 +1,76 @@
-# Broad Research Scan — 2026-06-26 00:58 UTC
+# Broad Research Scan — 2026-06-26 01:22 UTC
 _Auto-generated. Not targeted — general scan of recent content._

 ## Reddit

 ### r/weightroom

- **[Could not reach r/weightroom]**
-  - Score: 0 | Comments: 0 | ?
-  - 
+- **Weekly Conditioning Challenge - June 22, 2026**
+  - https://www.reddit.com/r/weightroom/comments/1ucdvyq/weekly_conditioning_challenge_june_22_2026/ (2026-06-22)
+- **Monthly Training Thread - Training Around Injuries December 2025**
+  - https://www.reddit.com/r/weightroom/comments/1pb80u3/monthly_training_thread_training_around_injuries/ (2025-12-01)
+- **Daily Thread - June 25, 2026**
+  - https://www.reddit.com/r/weightroom/comments/1uf2qga/daily_thread_june_25_2026/ (2026-06-25)
+- **Daily Thread - June 24, 2026**
+  - https://www.reddit.com/r/weightroom/comments/1ue683y/daily_thread_june_24_2026/ (2026-06-24)
+- **Daily Thread - June 23, 2026**
+  - https://www.reddit.com/r/weightroom/comments/1ud9uyn/daily_thread_june_23_2026/ (2026-06-23)
+- **Daily Thread - June 22, 2026**
+  - https://www.reddit.com/r/weightroom/comments/1ucdeh0/daily_thread_june_22_2026/ (2026-06-22)
+- **Daily Thread - June 21, 2026**
+  - https://www.reddit.com/r/weightroom/comments/1ubj8g5/daily_thread_june_21_2026/ (2026-06-21)
+- **Daily Thread - June 20, 2026**
+  - https://www.reddit.com/r/weightroom/comments/1uapt5n/daily_thread_june_20_2026/ (2026-06-20)
+- **Daily Thread - June 19, 2026**
+  - https://www.reddit.com/r/weightroom/comments/1u9uv60/daily_thread_june_19_2026/ (2026-06-19)
+- **Foodie Friday**
+  - https://www.reddit.com/r/weightroom/comments/1u9wud3/foodie_friday/ (2026-06-19)

 ### r/advancedfitness

- **[Could not reach r/advancedfitness]**
-  - Score: 0 | Comments: 0 | ?
-  - 
+- **READ BEFORE POSTING! Our rules and guidelines**
+  - https://old.reddit.com/r/AdvancedFitness/comments/vassb8/read_before_posting_our_rules_and_guidelines/ (2022-06-12)
+- **Weekly Simple Questions Thread - October 13, 2025**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1o5en3a/weekly_simple_questions_thread_october_13_2025/ (2025-10-13)
+- **[AF] Skeletal muscle overuse injury: pathophysiological mechanisms, molecular pathways, and rehabilitation strategies (2026)**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1ubmrnx/af_skeletal_muscle_overuse_injury/ (2026-06-21)
+- **[AF] Exercise Training Stimulates the Release of Glutathione Peroxidase 1 (GPX1) Enriched Extracellular Vesicles That Promote Angiogenesis (2026)**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1ubmr2p/af_exercise_training_stimulates_the_release_of/ (2026-06-21)
+- **[AF] Prolonged heat stress induces autophagy in mouse skeletal muscle (2026)**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1ubmy5x/af_prolonged_heat_stress_induces_autophagy_in/ (2026-06-21)
+- **[AF] Myostatin Signaling in Skeletal Muscle: Implications for Athletic Performance (2026)**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1ubmvr8/af_myostatin_signaling_in_skeletal_muscle/ (2026-06-21)
+- **[AF] Chapter Two: Impact of different exercise modalities on mitophagy in human skeletal muscle (2026)**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1ubmu2s/af_chapter_two_impact_of_different_exercise/ (2026-06-21)
+- **[AF] High and Low Load Resistance Training Produce Distinct Skeletal Muscle Growth but Similar Changes in Tendon Morphology (2026)**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1ub3x69/af_high_and_low_load_resistance_training_produce/ (2026-06-20)
+- **[AF] Pulse Rate Variability Is Not the Same as Heart Rate Variability: Implications for Sports Performance and Injury Prevention (2026)**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1ub4gz2/af_pulse_rate_variability_is_not_the_same_as/ (2026-06-20)
+- **[AF] Irisin promotes selective changes in hippocampal mitochondrial metabolism in mice (2026)**
+  - https://old.reddit.com/r/AdvancedFitness/comments/1u9xguf/af_irisin_promotes_selective_changes_in/ (2026-06-19)

 ### r/StrongerByScience

- **[Could not reach r/StrongerByScience]**
-  - Score: 0 | Comments: 0 | ?
-  - 
+- **So, what's the deal with this subreddit?**
+  - https://old.reddit.com/r/StrongerByScience/comments/j7fgfk/so_whats_the_deal_with_this_subreddit/ (2020-10-08)
+- **Adjusting Rep Ranges**
+  - https://old.reddit.com/r/StrongerByScience/comments/1ufoa29/adjusting_rep_ranges/ (2026-06-25)
+- **Losing that gear as you age? How does strength declines?**
+  - https://old.reddit.com/r/StrongerByScience/comments/1uezqor/losing_that_gear_as_you_age_how_does_strength/ (2026-06-25)
+- **Rolled Forward Shoulders Fix Advice**
+  - https://old.reddit.com/r/StrongerByScience/comments/1ufj4xx/rolled_forward_shoulders_fix_advice/ (2026-06-25)
+- **Best for chest?**
+  - https://old.reddit.com/r/StrongerByScience/comments/1udo5e0/best_for_chest/ (2026-06-23)
+- **Intermediate program w/lower volume main lifts like SBS article?**
+  - https://old.reddit.com/r/StrongerByScience/comments/1udnely/intermediate_program_wlower_volume_main_lifts/ (2026-06-23)
+- **two interesting statements about bench training for competitors - which do yawl think is more true?**
+  - https://old.reddit.com/r/StrongerByScience/comments/1udowlo/two_interesting_statements_about_bench_training/ (2026-06-23)
+- **Hard Stalled on Strength & Size for 6 Months - Need Advice to break plateau**
+  - https://old.reddit.com/r/StrongerByScience/comments/1udiws0/hard_stalled_on_strength_size_for_6_months_need/ (2026-06-23)
+- **how to fully target lats?**
+  - https://old.reddit.com/r/StrongerByScience/comments/1ud932j/how_to_fully_target_lats/ (2026-06-23)
+- **When to do cardio, and still get maximum out of gym performance**
+  - https://old.reddit.com/r/StrongerByScience/comments/1ucmlaa/when_to_do_cardio_and_still_get_maximum_out_of/ (2026-06-22)

 ## PubMed — Recent Articles

@ -76,4 +127,3 @@ _Auto-generated. Not targeted — general scan of recent content._
 - **Effects of High-Intensity Interval Training with Blood Flow Restriction Versus Normobaric Hypoxia on Physiological Parameters in Apparently Healthy Young Men.**
  - Narrea Vargas Jose Jairo, Castillo-Paredes Antonio, Iman Torres Alexander Javier | Sports (Basel, Switzerland)
  - https://pubmed.ncbi.nlm.nih.gov/42347455/
-