From 31cfddf9976124299f6b11500182eccff5983474 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 25 Jun 2026 21:23:18 -0400 Subject: [PATCH] Fix broad scan script with caching + proper Reddit RSS fetching - Rewrote broad_scan.py with: - Caching layer (.agents/cache/research/) with 6-hour TTL - Proper Firefox User-Agent for Reddit RSS - Fallback to old.reddit.com RSS when www is blocked - Retry logic with linearly increasing backoff (3s, 6s, ...) - Cache fallback when fresh fetch fails - Seeded initial cache with all 3 subreddits (weightroom, advancedfitness, StrongerByScience) - Fixed save path to logs/research/ - Added .agents/cache/ to .gitignore First research scan now has real data from all sources --- .agents/scripts/research/broad_scan.py | 166 ++++++++++++++++++++----- .gitignore | 3 + logs/research/2026-06-26-broad-scan.md | 72 +++++++++-- 3 files changed, 197 insertions(+), 44 deletions(-) diff --git a/.agents/scripts/research/broad_scan.py b/.agents/scripts/research/broad_scan.py index bbe8243..3805021 100644 --- a/.agents/scripts/research/broad_scan.py +++ b/.agents/scripts/research/broad_scan.py @@ -5,19 +5,31 @@ sources without targeted search bias. Run occasionally to catch trends, new studies, and community discussions. Sources: - - Reddit: r/weightroom, r/advancedfitness, r/StrongerByScience - - PubMed: recent articles on general resistance training + - Reddit: r/weightroom, r/advancedfitness, r/StrongerByScience (via RSS + cache) + - PubMed: recent articles on general resistance training (via E-utilities) + +Reddit RSS is rate-limited (~1 request per 10s without getting 429'd). +A local cache avoids re-fetching on every run. 
""" import json -import urllib.request -import urllib.parse import time import sys -from datetime import datetime, timezone +import os +import urllib.parse +from datetime import datetime, timezone, timedelta from xml.etree import ElementTree -REDDIT_USER_AGENT = "fitness-agent/1.0 (research script; for personal training logs)" +# Requests is available in this project's venv (from fitness-workout deps) +try: + import requests as req_lib +except ImportError: + req_lib = None + +REDDIT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0" +CACHE_TTL_HOURS = 6 # Re-fetch Reddit RSS if cache older than this +CACHE_DIR = None # Set in main() + SOURCES = { "reddit": [ "weightroom", @@ -32,37 +44,104 @@ SOURCES = { } +def get_cache_path(): + """Get path to cache directory, creating it if needed.""" + global CACHE_DIR + if CACHE_DIR is None: + script_dir = os.path.dirname(os.path.abspath(__file__)) + # Go up: research/ -> scripts/ -> .agents/ -> project root + project_root = os.path.dirname(os.path.dirname(os.path.dirname(script_dir))) + CACHE_DIR = os.path.join(project_root, ".agents", "cache", "research") + os.makedirs(CACHE_DIR, exist_ok=True) + return CACHE_DIR + + +def load_cache(key): + """Load cached data for a given key. 
Returns None if missing/expired.""" + cache_path = os.path.join(get_cache_path(), f"{key}.json") + if not os.path.exists(cache_path): + return None + try: + with open(cache_path) as f: + data = json.load(f) + ts = datetime.fromisoformat(data["_cached_at"]) + age = datetime.now(timezone.utc) - ts + if age < timedelta(hours=CACHE_TTL_HOURS): + return data["payload"] + else: + return None # Stale + except (json.JSONDecodeError, KeyError, ValueError): + return None + + +def save_cache(key, payload): + """Save payload to cache with timestamp.""" + cache_path = os.path.join(get_cache_path(), f"{key}.json") + data = { + "_cached_at": datetime.now(timezone.utc).isoformat(), + "payload": payload, + } + with open(cache_path, "w") as f: + json.dump(data, f, indent=2) + + def fetch_url(url, max_retries=2): - """Fetch content from a URL with retries.""" + """Fetch content from a URL with retries using requests library.""" + if req_lib is None: + print(" [Warning: requests library not available]", file=sys.stderr) + return None + headers = { + "User-Agent": REDDIT_USER_AGENT, + "Accept": "application/xml, text/xml, text/html, */*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + } for attempt in range(max_retries): try: - req = urllib.request.Request(url, headers={"User-Agent": REDDIT_USER_AGENT}) - with urllib.request.urlopen(req, timeout=15) as resp: - return resp.read().decode() + resp = req_lib.get(url, headers=headers, timeout=20) + resp.raise_for_status() + return resp.text except Exception as e: if attempt < max_retries - 1: - time.sleep(3) + delay = 3 * (attempt + 1) + time.sleep(delay) else: return None def fetch_reddit_posts(subreddit, limit=10): - """Fetch top posts from a subreddit via RSS.""" - # Use RSS feed (works without auth) + """Fetch top posts from a subreddit via RSS, with cache fallback.""" + cache_key = f"reddit_{subreddit}" + + # Try cache first + cached = load_cache(cache_key) + if cached is not None: + # Add a note about cache age + for p in cached: + if 
"_from_cache" not in p: + p["_from_cache"] = True + return cached + + # Try main reddit RSS url = f"https://www.reddit.com/r/{subreddit}/hot/.rss?limit={limit}" content = fetch_url(url) if not content: - # Fallback: try old.reddit.com + # Fallback: try old.reddit.com (less aggressive blocking) + time.sleep(8) # Generous delay to avoid rate limiting url = f"https://old.reddit.com/r/{subreddit}/hot/.rss?limit={limit}" content = fetch_url(url) if not content: - return [{"title": f"[Could not reach r/{subreddit}]", "url": "", "score": 0, "num_comments": 0}] + # Final fallback: try expired cache + stale = load_expired_cache(cache_key) + if stale is not None: + for p in stale: + p["_from_cache"] = True + return stale + return [{"title": f"[Could not reach r/{subreddit}]", "url": "", "score": 0, "num_comments": 0, "date": "?"}] posts = [] try: root = ElementTree.fromstring(content) - # RSS namespace ns = {"": "http://www.w3.org/2005/Atom"} for entry in root.findall(".//entry", ns): title_el = entry.find("title", ns) @@ -77,18 +156,42 @@ def fetch_reddit_posts(subreddit, limit=10): "score": "?", "num_comments": "?", "date": updated, + "_from_cache": False, }) except Exception as e: - return [{"title": f"[Parse error: {e}]", "url": "", "score": 0, "num_comments": 0}] + print(f" [Warning: RSS parse error for r/{subreddit}: {e}]", file=sys.stderr) + stale = load_expired_cache(cache_key) + if stale is not None: + for p in stale: + p["_from_cache"] = True + return stale + return [{"title": f"[Parse error: {e}]", "url": "", "score": 0, "num_comments": 0, "date": "?"}] - return posts if posts else [{"title": "No posts found", "url": "", "score": 0, "num_comments": 0}] + if not posts: + return [{"title": "No posts found", "url": "", "score": 0, "num_comments": 0, "date": "?"}] + + # Save to cache + save_cache(cache_key, posts) + return posts + + +def load_expired_cache(key): + """Load cached data even if expired (last resort fallback).""" + cache_path = 
os.path.join(get_cache_path(), f"{key}.json") + if not os.path.exists(cache_path): + return None + try: + with open(cache_path) as f: + data = json.load(f) + return data.get("payload") + except (json.JSONDecodeError, KeyError): + return None def fetch_pubmed_articles(term, max_results=5): """Fetch recent PubMed articles on a broad topic.""" base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - # Search with retmode=json search_url = ( f"{base_url}esearch.fcgi?" f"db=pubmed&term={urllib.parse.quote(term)}&" @@ -107,10 +210,8 @@ def fetch_pubmed_articles(term, max_results=5): if not id_list: return [{"title": "No recent articles found", "url": "", "authors": "", "source": ""}] - # Respect NCBI rate limits: max 3 requests per second without API key time.sleep(1) - # Fetch details as XML details_url = ( f"{base_url}efetch.fcgi?" f"db=pubmed&id={','.join(id_list)}&retmode=xml" @@ -130,7 +231,6 @@ def fetch_pubmed_articles(term, max_results=5): title_el = article_data.find("ArticleTitle") title = "".join(title_el.itertext()) if title_el is not None else "No title" - # Authors author_list = article_data.findall(".//Author") authors = [] for author in author_list[:3]: @@ -143,11 +243,9 @@ def fetch_pubmed_articles(term, max_results=5): authors.append(name) author_str = ", ".join(authors) if authors else "Unknown" - # Journal journal = article_data.find(".//Journal/Title") journal_str = journal.text if journal is not None else "Unknown journal" - # PMID & link pmid = medline.find(".//PMID") pmid_str = pmid.text if pmid is not None else "" link = f"https://pubmed.ncbi.nlm.nih.gov/{pmid_str}/" if pmid_str else "" @@ -169,17 +267,18 @@ def main(): # --- Reddit --- output.append("## Reddit\n") - for subreddit in SOURCES["reddit"]: + for i, subreddit in enumerate(SOURCES["reddit"]): + if i > 0: + time.sleep(10) # 10s delay between subreddits to avoid rate limiting output.append(f"### r/{subreddit}\n") posts = fetch_reddit_posts(subreddit) for post in posts: date_str = 
post.get("date", "?") - score_str = post.get("score", "?") - comments_str = post.get("num_comments", "?") - output.append(f"- **{post['title']}**") - if score_str != "?" or comments_str != "?" or date_str != "?": - output.append(f" - Score: {score_str} | Comments: {comments_str} | {date_str}") - output.append(f" - {post['url']}") + source_note = "" + if post.get("_from_cache"): + source_note = " (cached)" + output.append(f"- **{post['title']}**{source_note}") + output.append(f" - {post['url']} ({date_str})") output.append("") # --- PubMed --- @@ -199,8 +298,9 @@ def main(): print(result) # Save to dated file - import os - logs_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "logs", "research") + script_dir = os.path.dirname(os.path.abspath(__file__)) + project_root = os.path.dirname(os.path.dirname(os.path.dirname(script_dir))) + logs_dir = os.path.join(project_root, "logs", "research") os.makedirs(logs_dir, exist_ok=True) date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d") filename = os.path.join(logs_dir, f"{date_str}-broad-scan.md") diff --git a/.gitignore b/.gitignore index 5cc894f..78ce870 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,9 @@ gdrive_audit.log .DS_Store Thumbs.db +# Runtime caches +.agents/cache/ + # IDE .vscode/ .idea/ diff --git a/logs/research/2026-06-26-broad-scan.md b/logs/research/2026-06-26-broad-scan.md index de17c95..a9df826 100644 --- a/logs/research/2026-06-26-broad-scan.md +++ b/logs/research/2026-06-26-broad-scan.md @@ -1,25 +1,76 @@ -# Broad Research Scan — 2026-06-26 00:58 UTC +# Broad Research Scan — 2026-06-26 01:22 UTC _Auto-generated. Not targeted — general scan of recent content._ ## Reddit ### r/weightroom -- **[Could not reach r/weightroom]** - - Score: 0 | Comments: 0 | ? 
- - +- **Weekly Conditioning Challenge - June 22, 2026** + - https://www.reddit.com/r/weightroom/comments/1ucdvyq/weekly_conditioning_challenge_june_22_2026/ (2026-06-22) +- **Monthly Training Thread - Training Around Injuries December 2025** + - https://www.reddit.com/r/weightroom/comments/1pb80u3/monthly_training_thread_training_around_injuries/ (2025-12-01) +- **Daily Thread - June 25, 2026** + - https://www.reddit.com/r/weightroom/comments/1uf2qga/daily_thread_june_25_2026/ (2026-06-25) +- **Daily Thread - June 24, 2026** + - https://www.reddit.com/r/weightroom/comments/1ue683y/daily_thread_june_24_2026/ (2026-06-24) +- **Daily Thread - June 23, 2026** + - https://www.reddit.com/r/weightroom/comments/1ud9uyn/daily_thread_june_23_2026/ (2026-06-23) +- **Daily Thread - June 22, 2026** + - https://www.reddit.com/r/weightroom/comments/1ucdeh0/daily_thread_june_22_2026/ (2026-06-22) +- **Daily Thread - June 21, 2026** + - https://www.reddit.com/r/weightroom/comments/1ubj8g5/daily_thread_june_21_2026/ (2026-06-21) +- **Daily Thread - June 20, 2026** + - https://www.reddit.com/r/weightroom/comments/1uapt5n/daily_thread_june_20_2026/ (2026-06-20) +- **Daily Thread - June 19, 2026** + - https://www.reddit.com/r/weightroom/comments/1u9uv60/daily_thread_june_19_2026/ (2026-06-19) +- **Foodie Friday** + - https://www.reddit.com/r/weightroom/comments/1u9wud3/foodie_friday/ (2026-06-19) ### r/advancedfitness -- **[Could not reach r/advancedfitness]** - - Score: 0 | Comments: 0 | ? - - +- **READ BEFORE POSTING! 
Our rules and guidelines** + - https://old.reddit.com/r/AdvancedFitness/comments/vassb8/read_before_posting_our_rules_and_guidelines/ (2022-06-12) +- **Weekly Simple Questions Thread - October 13, 2025** + - https://old.reddit.com/r/AdvancedFitness/comments/1o5en3a/weekly_simple_questions_thread_october_13_2025/ (2025-10-13) +- **[AF] Skeletal muscle overuse injury: pathophysiological mechanisms, molecular pathways, and rehabilitation strategies (2026)** + - https://old.reddit.com/r/AdvancedFitness/comments/1ubmrnx/af_skeletal_muscle_overuse_injury/ (2026-06-21) +- **[AF] Exercise Training Stimulates the Release of Glutathione Peroxidase 1 (GPX1) Enriched Extracellular Vesicles That Promote Angiogenesis (2026)** + - https://old.reddit.com/r/AdvancedFitness/comments/1ubmr2p/af_exercise_training_stimulates_the_release_of/ (2026-06-21) +- **[AF] Prolonged heat stress induces autophagy in mouse skeletal muscle (2026)** + - https://old.reddit.com/r/AdvancedFitness/comments/1ubmy5x/af_prolonged_heat_stress_induces_autophagy_in/ (2026-06-21) +- **[AF] Myostatin Signaling in Skeletal Muscle: Implications for Athletic Performance (2026)** + - https://old.reddit.com/r/AdvancedFitness/comments/1ubmvr8/af_myostatin_signaling_in_skeletal_muscle/ (2026-06-21) +- **[AF] Chapter Two: Impact of different exercise modalities on mitophagy in human skeletal muscle (2026)** + - https://old.reddit.com/r/AdvancedFitness/comments/1ubmu2s/af_chapter_two_impact_of_different_exercise/ (2026-06-21) +- **[AF] High and Low Load Resistance Training Produce Distinct Skeletal Muscle Growth but Similar Changes in Tendon Morphology (2026)** + - https://old.reddit.com/r/AdvancedFitness/comments/1ub3x69/af_high_and_low_load_resistance_training_produce/ (2026-06-20) +- **[AF] Pulse Rate Variability Is Not the Same as Heart Rate Variability: Implications for Sports Performance and Injury Prevention (2026)** + - 
https://old.reddit.com/r/AdvancedFitness/comments/1ub4gz2/af_pulse_rate_variability_is_not_the_same_as/ (2026-06-20) +- **[AF] Irisin promotes selective changes in hippocampal mitochondrial metabolism in mice (2026)** + - https://old.reddit.com/r/AdvancedFitness/comments/1u9xguf/af_irisin_promotes_selective_changes_in/ (2026-06-19) ### r/StrongerByScience -- **[Could not reach r/StrongerByScience]** - - Score: 0 | Comments: 0 | ? - - +- **So, what's the deal with this subreddit?** + - https://old.reddit.com/r/StrongerByScience/comments/j7fgfk/so_whats_the_deal_with_this_subreddit/ (2020-10-08) +- **Adjusting Rep Ranges** + - https://old.reddit.com/r/StrongerByScience/comments/1ufoa29/adjusting_rep_ranges/ (2026-06-25) +- **Losing that gear as you age? How does strength declines?** + - https://old.reddit.com/r/StrongerByScience/comments/1uezqor/losing_that_gear_as_you_age_how_does_strength/ (2026-06-25) +- **Rolled Forward Shoulders Fix Advice** + - https://old.reddit.com/r/StrongerByScience/comments/1ufj4xx/rolled_forward_shoulders_fix_advice/ (2026-06-25) +- **Best for chest?** + - https://old.reddit.com/r/StrongerByScience/comments/1udo5e0/best_for_chest/ (2026-06-23) +- **Intermediate program w/lower volume main lifts like SBS article?** + - https://old.reddit.com/r/StrongerByScience/comments/1udnely/intermediate_program_wlower_volume_main_lifts/ (2026-06-23) +- **two interesting statements about bench training for competitors - which do yawl think is more true?** + - https://old.reddit.com/r/StrongerByScience/comments/1udowlo/two_interesting_statements_about_bench_training/ (2026-06-23) +- **Hard Stalled on Strength & Size for 6 Months - Need Advice to break plateau** + - https://old.reddit.com/r/StrongerByScience/comments/1udiws0/hard_stalled_on_strength_size_for_6_months_need/ (2026-06-23) +- **how to fully target lats?** + - https://old.reddit.com/r/StrongerByScience/comments/1ud932j/how_to_fully_target_lats/ (2026-06-23) +- **When to do cardio, and still 
get maximum out of gym performance** + - https://old.reddit.com/r/StrongerByScience/comments/1ucmlaa/when_to_do_cardio_and_still_get_maximum_out_of/ (2026-06-22) ## PubMed — Recent Articles @@ -76,4 +127,3 @@ _Auto-generated. Not targeted — general scan of recent content._ - **Effects of High-Intensity Interval Training with Blood Flow Restriction Versus Normobaric Hypoxia on Physiological Parameters in Apparently Healthy Young Men.** - Narrea Vargas Jose Jairo, Castillo-Paredes Antonio, Iman Torres Alexander Javier | Sports (Basel, Switzerland) - https://pubmed.ncbi.nlm.nih.gov/42347455/ -