#!/usr/bin/env python3
"""
Broad research scan: fetches recent high-signal content from fitness/science
sources without targeted search bias. Run occasionally to catch trends, new
studies, and community discussions.

Sources:
- Reddit: r/weightroom, r/advancedfitness, r/StrongerByScience (via RSS + cache)
- PubMed: recent articles on general resistance training (via E-utilities)

Reddit RSS is rate-limited (~1 request per 10s without getting 429'd).
A local cache avoids re-fetching on every run.
"""

import json
import time
import sys
import os
import urllib.parse
from datetime import datetime, timezone, timedelta
from xml.etree import ElementTree

# Requests is available in this project's venv (from fitness-workout deps)
try:
    import requests as req_lib
except ImportError:
    req_lib = None

REDDIT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
CACHE_TTL_HOURS = 6  # Re-fetch Reddit RSS if cache older than this
CACHE_DIR = None  # Resolved lazily by get_cache_path()

SOURCES = {
    "reddit": [
        "weightroom",
        "advancedfitness",
        "StrongerByScience",
    ],
    "pubmed_terms": [
        "resistance training",
        "strength training programming",
        "exercise physiology",
    ],
}


def _project_root():
    """Return the project root, three directories above this script's folder."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Go up: research/ -> scripts/ -> .agents/ -> project root
    return os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))


# --------------------------------------------------------------------------
# Cache
# --------------------------------------------------------------------------

def get_cache_path():
    """Get path to cache directory, creating it if needed.

    The location is computed once and stored in the module-level CACHE_DIR;
    a value already present in CACHE_DIR is used as-is.
    """
    global CACHE_DIR
    if CACHE_DIR is None:
        CACHE_DIR = os.path.join(_project_root(), ".agents", "cache", "research")
    os.makedirs(CACHE_DIR, exist_ok=True)
    return CACHE_DIR


def _read_cache_file(key):
    """Return the parsed cache envelope for *key*, or None if unusable.

    A missing, unreadable, or corrupt file is treated as a cache miss rather
    than an error, because the cache is only an optimisation.
    """
    cache_path = os.path.join(get_cache_path(), f"{key}.json")
    try:
        with open(cache_path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):  # ValueError covers JSONDecodeError
        return None
    return data if isinstance(data, dict) else None


def load_cache(key):
    """Load cached data for a given key.

    Returns the stored payload, or None if the entry is missing, malformed,
    or older than CACHE_TTL_HOURS.
    """
    data = _read_cache_file(key)
    if data is None:
        return None
    try:
        cached_at = datetime.fromisoformat(data["_cached_at"])
        payload = data["payload"]
    except (KeyError, TypeError, ValueError):
        return None
    if cached_at.tzinfo is None:
        # A naive timestamp cannot be subtracted from an aware "now";
        # save_cache() always writes UTC, so interpret it as UTC.
        cached_at = cached_at.replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - cached_at
    if age < timedelta(hours=CACHE_TTL_HOURS):
        return payload
    return None  # Stale


def load_expired_cache(key):
    """Load cached data even if expired (last resort fallback).

    Returns the stored payload regardless of age, or None if there is no
    readable cache entry for *key*.
    """
    data = _read_cache_file(key)
    if data is None:
        return None
    return data.get("payload")


def save_cache(key, payload):
    """Save payload to cache with a UTC timestamp.

    *payload* must be JSON-serialisable.
    """
    cache_path = os.path.join(get_cache_path(), f"{key}.json")
    data = {
        "_cached_at": datetime.now(timezone.utc).isoformat(),
        "payload": payload,
    }
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


# --------------------------------------------------------------------------
# HTTP
# --------------------------------------------------------------------------

def fetch_url(url, max_retries=2):
    """Fetch content from a URL with retries using the requests library.

    Returns the response body as text, or None if requests is unavailable or
    every attempt failed (connection error, timeout, or HTTP error status).
    """
    if req_lib is None:
        print(" [Warning: requests library not available]", file=sys.stderr)
        return None

    headers = {
        "User-Agent": REDDIT_USER_AGENT,
        "Accept": "application/xml, text/xml, text/html, */*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
    for attempt in range(max_retries):
        try:
            resp = req_lib.get(url, headers=headers, timeout=20)
            resp.raise_for_status()
            return resp.text
        except Exception as exc:
            # Deliberately broad: this is a best-effort fetch and every
            # caller already handles a None result.
            if attempt < max_retries - 1:
                time.sleep(3 * (attempt + 1))  # Linear back-off: 3s, 6s, ...
            else:
                print(f" [Warning: giving up on {url}: {exc}]", file=sys.stderr)
    return None


# --------------------------------------------------------------------------
# Reddit
# --------------------------------------------------------------------------

def _placeholder_post(title):
    """Build a single stand-in post used to report a failure in the output."""
    return {"title": title, "url": "", "score": 0, "num_comments": 0, "date": "?"}


def _mark_cached(posts):
    """Flag every post as served from cache and return the same list."""
    for post in posts:
        post["_from_cache"] = True
    return posts


def _stale_posts_or_placeholder(cache_key, placeholder_title):
    """Return expired cache contents if any, else a one-item placeholder list."""
    stale = load_expired_cache(cache_key)
    if stale is not None:
        return _mark_cached(stale)
    return [_placeholder_post(placeholder_title)]


def _parse_reddit_feed(content):
    """Parse an Atom feed into post dicts; raises ElementTree.ParseError."""
    root = ElementTree.fromstring(content)
    ns = {"": "http://www.w3.org/2005/Atom"}
    posts = []
    for entry in root.findall(".//entry", ns):
        title_el = entry.find("title", ns)
        link_el = entry.find("link", ns)
        updated_el = entry.find("updated", ns)

        title = title_el.text if title_el is not None else None
        link = link_el.get("href", "") if link_el is not None else ""
        has_date = updated_el is not None and updated_el.text
        posts.append({
            "title": title or "No title",
            "url": link,
            # The RSS feed carries no score or comment counts.
            "score": "?",
            "num_comments": "?",
            # Keep only the YYYY-MM-DD prefix of the timestamp.
            "date": updated_el.text[:10] if has_date else "?",
            "_from_cache": False,
        })
    return posts


def fetch_reddit_posts(subreddit, limit=10):
    """Fetch hot posts from a subreddit via RSS, with cache fallback.

    Order of attempts: fresh cache, www.reddit.com, old.reddit.com, expired
    cache. Always returns a non-empty list of dicts with the keys "title",
    "url", "score", "num_comments" and "date"; when nothing could be
    retrieved the list holds one placeholder entry describing the failure.
    Posts served from cache have "_from_cache" set to True.
    """
    cache_key = f"reddit_{subreddit}"

    # Try cache first. Entries were stored with _from_cache=False, so the
    # flag must be overwritten here or cache hits would never be reported.
    cached = load_cache(cache_key)
    if cached is not None:
        return _mark_cached(cached)

    # Try main reddit RSS
    content = fetch_url(f"https://www.reddit.com/r/{subreddit}/hot/.rss?limit={limit}")
    if not content:
        # Fallback: try old.reddit.com (less aggressive blocking)
        time.sleep(8)  # Generous delay to avoid rate limiting
        content = fetch_url(f"https://old.reddit.com/r/{subreddit}/hot/.rss?limit={limit}")
    if not content:
        # Final fallback: try expired cache
        return _stale_posts_or_placeholder(cache_key, f"[Could not reach r/{subreddit}]")

    try:
        posts = _parse_reddit_feed(content)
    except ElementTree.ParseError as e:
        print(f" [Warning: RSS parse error for r/{subreddit}: {e}]", file=sys.stderr)
        return _stale_posts_or_placeholder(cache_key, f"[Parse error: {e}]")

    if not posts:
        return [_placeholder_post("No posts found")]

    # Save to cache
    save_cache(cache_key, posts)
    return posts


# --------------------------------------------------------------------------
# PubMed
# --------------------------------------------------------------------------

def _placeholder_article(title):
    """Build a single stand-in article used to report a failure in the output."""
    return {"title": title, "url": "", "authors": "", "source": ""}


def _parse_pubmed_xml(xml_data, max_results):
    """Parse efetch XML into article dicts; raises ElementTree.ParseError."""
    articles = []
    root = ElementTree.fromstring(xml_data)
    for article in root.findall(".//PubmedArticle")[:max_results]:
        medline = article.find(".//MedlineCitation")
        article_data = medline.find(".//Article") if medline is not None else None
        if article_data is None:
            continue

        title_el = article_data.find("ArticleTitle")
        # itertext() keeps text nested inside inline markup in the title.
        title = "".join(title_el.itertext()) if title_el is not None else "No title"

        authors = []
        for author in article_data.findall(".//Author")[:3]:
            last = author.find("LastName")
            if last is None:
                continue
            name = last.text or ""
            fore = author.find("ForeName")
            if fore is not None:
                name += f" {fore.text or ''}"
            authors.append(name)

        journal = article_data.find(".//Journal/Title")
        journal_text = journal.text if journal is not None else None
        pmid = medline.find(".//PMID")
        pmid_str = (pmid.text or "") if pmid is not None else ""

        articles.append({
            "title": title,
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid_str}/" if pmid_str else "",
            "authors": ", ".join(authors) if authors else "Unknown",
            "source": journal_text or "Unknown journal",
        })
    return articles


def fetch_pubmed_articles(term, max_results=5):
    """Fetch recent PubMed articles on a broad topic.

    Runs an esearch for *term*, then an efetch for the returned IDs. Always
    returns a non-empty list of dicts with the keys "title", "url", "authors"
    and "source"; on any failure the list holds one placeholder entry whose
    title describes what went wrong.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    search_url = (
        f"{base_url}esearch.fcgi?"
        f"db=pubmed&term={urllib.parse.quote(term)}&"
        f"retmax={max_results}&sort=date&retmode=json"
    )
    content = fetch_url(search_url)
    if not content:
        return [_placeholder_article(f"[Could not reach PubMed for '{term}']")]

    try:
        search_data = json.loads(content)
    except json.JSONDecodeError:
        return [_placeholder_article(f"[Parse error for '{term}']")]

    id_list = search_data.get("esearchresult", {}).get("idlist", [])
    if not id_list:
        return [_placeholder_article("No recent articles found")]

    time.sleep(1)  # Pause between the two E-utilities requests
    details_url = (
        f"{base_url}efetch.fcgi?"
        f"db=pubmed&id={','.join(id_list)}&retmode=xml"
    )
    xml_data = fetch_url(details_url)
    if not xml_data:
        return [_placeholder_article(f"[Could not fetch details for '{term}']")]

    try:
        articles = _parse_pubmed_xml(xml_data, max_results)
    except ElementTree.ParseError as e:
        # Malformed XML must not abort the whole scan; report and move on.
        print(f" [Warning: PubMed XML parse error for '{term}': {e}]", file=sys.stderr)
        return [_placeholder_article(f"[Parse error for '{term}']")]

    return articles if articles else [_placeholder_article("No structured data")]


# --------------------------------------------------------------------------
# Entry point
# --------------------------------------------------------------------------

def main():
    """Run the scan, print the Markdown report, and save it to a dated file.

    The report goes to stdout and to
    <project root>/logs/research/<YYYY-MM-DD>-broad-scan.md; the saved-file
    notice goes to stderr so it stays out of the report itself.
    """
    output = []
    output.append(f"# Broad Research Scan — {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}")
    output.append("_Auto-generated. Not targeted — general scan of recent content._\n")

    # --- Reddit ---
    output.append("## Reddit\n")
    for i, subreddit in enumerate(SOURCES["reddit"]):
        if i > 0:
            time.sleep(10)  # 10s delay between subreddits to avoid rate limiting
        output.append(f"### r/{subreddit}\n")
        for post in fetch_reddit_posts(subreddit):
            post_date = post.get("date", "?")
            source_note = " (cached)" if post.get("_from_cache") else ""
            output.append(f"- **{post['title']}**{source_note}")
            output.append(f" - {post['url']} ({post_date})")
        output.append("")

    # --- PubMed ---
    output.append("## PubMed — Recent Articles\n")
    for term in SOURCES["pubmed_terms"]:
        output.append(f"### Topic: \"{term}\"\n")
        for article in fetch_pubmed_articles(term):
            output.append(f"- **{article['title']}**")
            if article.get("authors") and article.get("source"):
                output.append(f" - {article['authors']} | {article['source']}")
            if article.get("url"):
                output.append(f" - {article['url']}")
        output.append("")

    result = "\n".join(output)
    print(result)

    # Save to dated file
    logs_dir = os.path.join(_project_root(), "logs", "research")
    os.makedirs(logs_dir, exist_ok=True)
    file_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    filename = os.path.join(logs_dir, f"{file_date}-broad-scan.md")
    # Titles may contain any Unicode; do not depend on the locale encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result)
    print(f"\n--- Saved to {filename} ---", file=sys.stderr)


if __name__ == "__main__":
    main()