fitness-agent/.agents/scripts/research/broad_scan.py

#!/usr/bin/env python3
"""
Broad research scan: fetches recent high-signal content from fitness/science
sources without targeted search bias. Run occasionally to catch trends,
new studies, and community discussions.

Sources:
  - Reddit: r/weightroom, r/advancedfitness, r/StrongerByScience
  - PubMed: recent articles on resistance training, strength training
    programming, and exercise physiology
"""

import json
import urllib.request
import urllib.parse
import time
import sys
from datetime import datetime, timezone
from xml.etree import ElementTree

# User-Agent header value attached to every outgoing request made by this script.
REDDIT_USER_AGENT = (
    "fitness-agent/1.0 "
    "(research script; for personal training logs)"
)

# What gets scanned: subreddit names (without the "r/" prefix) and the
# free-text PubMed search terms, each in the order they appear in the report.
SOURCES = dict(
    reddit=["weightroom", "advancedfitness", "StrongerByScience"],
    pubmed_terms=[
        "resistance training",
        "strength training programming",
        "exercise physiology",
    ],
)


def fetch_url(url, max_retries=2):
    """Fetch a URL and return its body as text, or None on failure.

    Args:
        url: Absolute URL to request. The request carries the
            ``REDDIT_USER_AGENT`` header.
        max_retries: Total number of attempts (not the number of *extra*
            retries). With 0 or less, no request is made at all.

    Returns:
        The decoded response body, or ``None`` when every attempt failed.
        The last error is reported on stderr so a failed scan can be
        diagnosed; stdout is left alone because it carries the report.
    """
    import http.client

    last_error = None
    for attempt in range(max_retries):
        if attempt:
            # Pause between attempts rather than retrying immediately.
            time.sleep(3)
        try:
            req = urllib.request.Request(url, headers={"User-Agent": REDDIT_USER_AGENT})
            with urllib.request.urlopen(req, timeout=15) as resp:
                raw = resp.read()
                charset = resp.headers.get_content_charset() or "utf-8"
        except (OSError, ValueError, http.client.HTTPException) as err:
            # OSError covers URLError/HTTPError, timeouts and socket/TLS
            # failures; ValueError covers malformed URLs; HTTPException
            # covers truncated or otherwise invalid HTTP responses.
            last_error = err
            continue

        # Decode leniently: one undecodable byte should not discard an
        # otherwise usable document (or trigger a pointless retry).
        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset Python does not know.
            return raw.decode("utf-8", errors="replace")

    if last_error is not None:
        print(f"[fetch_url] giving up on {url}: {last_error}", file=sys.stderr)
    return None


def fetch_reddit_posts(subreddit, limit=10):
    """Fetch the "hot" posts of a subreddit from its unauthenticated feed.

    The ``/hot/.rss`` endpoint is requested on www.reddit.com first and on
    old.reddit.com as a fallback. The response is parsed as an Atom
    document (entries in the ``http://www.w3.org/2005/Atom`` namespace).

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        limit: Value passed as the feed's ``limit`` query parameter.

    Returns:
        A non-empty list of dicts. Real posts have the keys ``title``,
        ``url``, ``score``, ``num_comments`` and ``date`` (first 10
        characters of the entry's ``updated`` value, or "?"); ``score`` and
        ``num_comments`` are always "?" because this function does not read
        them from the feed. When the feed cannot be fetched, cannot be
        parsed, or has no entries, the list holds a single placeholder row
        whose ``title`` describes the problem and which has no ``date`` key.
    """
    # Percent-encode the name so unusual input cannot alter the URL path.
    name = urllib.parse.quote(subreddit, safe="")

    content = None
    for host in ("www.reddit.com", "old.reddit.com"):
        content = fetch_url(f"https://{host}/r/{name}/hot/.rss?limit={limit}")
        if content:
            break
    if not content:
        return [{"title": f"[Could not reach r/{subreddit}]", "url": "", "score": 0, "num_comments": 0}]

    try:
        root = ElementTree.fromstring(content)
    except (ElementTree.ParseError, ValueError) as e:
        return [{"title": f"[Parse error: {e}]", "url": "", "score": 0, "num_comments": 0}]

    # An explicit prefix is used instead of the empty-prefix default
    # namespace, which ElementTree only understands from Python 3.8 on.
    atom = {"atom": "http://www.w3.org/2005/Atom"}

    posts = []
    for entry in root.findall(".//atom:entry", atom):
        # findtext() yields "" for an empty element, so fall back explicitly
        # instead of letting a missing title surface as None.
        title = (entry.findtext("atom:title", "", atom) or "").strip() or "No title"
        link_el = entry.find("atom:link", atom)
        link = link_el.get("href", "") if link_el is not None else ""
        updated = (entry.findtext("atom:updated", "", atom) or "")[:10] or "?"
        posts.append({
            "title": title,
            "url": link,
            "score": "?",
            "num_comments": "?",
            "date": updated,
        })

    return posts if posts else [{"title": "No posts found", "url": "", "score": 0, "num_comments": 0}]


def _pubmed_placeholder(message):
    """Build the one-row result used to report a status instead of articles."""
    return [{"title": message, "url": "", "authors": "", "source": ""}]


def _summarize_pubmed_article(article):
    """Turn one ``PubmedArticle`` element into a report row, or None if it has no ``Article``."""
    medline = article.find(".//MedlineCitation")
    article_data = medline.find(".//Article") if medline is not None else None
    if article_data is None:
        return None

    # itertext() keeps the text of inline markup nested inside the title.
    title_el = article_data.find("ArticleTitle")
    title = "".join(title_el.itertext()).strip() if title_el is not None else ""

    # Only the first three listed authors are considered; entries without a
    # LastName element are skipped.
    authors = []
    for author in article_data.findall(".//Author")[:3]:
        last = author.findtext("LastName")
        if last is None:
            continue
        fore = author.findtext("ForeName") or ""
        name = f"{last} {fore}".strip()
        if name:
            authors.append(name)

    pmid = (medline.findtext(".//PMID") or "").strip()

    return {
        "title": title or "No title",
        "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else "",
        "authors": ", ".join(authors) if authors else "Unknown",
        "source": article_data.findtext(".//Journal/Title") or "Unknown journal",
    }


def fetch_pubmed_articles(term, max_results=5):
    """Fetch recent PubMed articles for a search term via NCBI E-utilities.

    Two requests are made: ESearch (JSON) to obtain matching PMIDs, then
    EFetch (XML) to obtain the details of those records.

    Args:
        term: Free-text PubMed query; it is URL-quoted before being sent.
        max_results: Upper bound on the number of articles returned.

    Returns:
        A non-empty list of dicts with the keys ``title``, ``url``,
        ``authors`` and ``source``. When a request fails, a response cannot
        be parsed, or nothing is found, the list holds a single placeholder
        row whose ``title`` describes the situation and whose other fields
        are empty strings.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # NOTE(review): the E-utilities documentation lists values such as
    # "pub_date" and "relevance" for `sort`; confirm that "date" is honoured.
    search_url = (
        f"{base_url}esearch.fcgi?"
        f"db=pubmed&term={urllib.parse.quote(term)}&"
        f"retmax={max_results}&sort=date&retmode=json"
    )
    content = fetch_url(search_url)
    if not content:
        return _pubmed_placeholder(f"[Could not reach PubMed for '{term}']")

    try:
        search_data = json.loads(content)
    except json.JSONDecodeError:
        return _pubmed_placeholder(f"[Parse error for '{term}']")

    # Tolerate replies that are valid JSON but not the expected object shape.
    search_result = search_data.get("esearchresult") if isinstance(search_data, dict) else None
    id_list = search_result.get("idlist") if isinstance(search_result, dict) else None
    if not id_list:
        return _pubmed_placeholder("No recent articles found")

    # Respect NCBI rate limits: max 3 requests per second without API key
    time.sleep(1)

    details_url = (
        f"{base_url}efetch.fcgi?"
        f"db=pubmed&id={','.join(str(pmid) for pmid in id_list)}&retmode=xml"
    )
    xml_data = fetch_url(details_url)
    if not xml_data:
        return _pubmed_placeholder(f"[Could not fetch details for '{term}']")

    # A malformed EFetch reply must degrade to a placeholder row, like every
    # other failure here, instead of aborting the whole scan.
    try:
        root = ElementTree.fromstring(xml_data)
    except (ElementTree.ParseError, ValueError):
        return _pubmed_placeholder(f"[Parse error for '{term}']")

    articles = []
    for article in root.findall(".//PubmedArticle")[:max_results]:
        row = _summarize_pubmed_article(article)
        if row is not None:
            articles.append(row)

    return articles if articles else _pubmed_placeholder("No structured data")


def main():
    """Run the scan, print the Markdown report, and save a dated copy.

    The report is printed to stdout; the path of the saved copy is announced
    on stderr. The copy is written as UTF-8 to
    ``logs/research/<YYYY-MM-DD>-broad-scan.md`` under the third ancestor
    directory of this file, overwriting any report already saved for the
    same UTC date.
    """
    import os

    # One timestamp feeds both the heading and the filename, so the two
    # cannot disagree when the networked scan runs across midnight UTC.
    started = datetime.now(timezone.utc)

    output = [
        f"# Broad Research Scan — {started.strftime('%Y-%m-%d %H:%M UTC')}",
        "_Auto-generated. Not targeted — general scan of recent content._\n",
    ]

    # --- Reddit ---
    output.append("## Reddit\n")
    for subreddit in SOURCES["reddit"]:
        output.append(f"### r/{subreddit}\n")
        for post in fetch_reddit_posts(subreddit):
            date_str = post.get("date", "?")
            score_str = post.get("score", "?")
            comments_str = post.get("num_comments", "?")
            output.append(f"- **{post['title']}**")
            # Skip the metadata line when every field is the "?" placeholder.
            if score_str != "?" or comments_str != "?" or date_str != "?":
                output.append(f"  - Score: {score_str} | Comments: {comments_str} | {date_str}")
            # Placeholder rows have no URL; avoid an empty bullet, matching
            # how the PubMed section below treats missing links.
            if post.get("url"):
                output.append(f"  - {post['url']}")
        output.append("")

    # --- PubMed ---
    output.append("## PubMed — Recent Articles\n")
    for term in SOURCES["pubmed_terms"]:
        output.append(f"### Topic: \"{term}\"\n")
        for article in fetch_pubmed_articles(term):
            output.append(f"- **{article['title']}**")
            if article.get("authors") and article.get("source"):
                output.append(f"  - {article['authors']} | {article['source']}")
            if article.get("url"):
                output.append(f"  - {article['url']}")
        output.append("")

    result = "\n".join(output)
    print(result)

    # Save to dated file
    script_path = os.path.abspath(__file__)
    project_root = os.path.dirname(os.path.dirname(os.path.dirname(script_path)))
    logs_dir = os.path.join(project_root, "logs", "research")
    os.makedirs(logs_dir, exist_ok=True)
    filename = os.path.join(logs_dir, f"{started.strftime('%Y-%m-%d')}-broad-scan.md")
    # Explicit UTF-8: the report contains non-ASCII text (em dashes, arbitrary
    # post and article titles) that a platform default encoding may reject.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result)
    print(f"\n--- Saved to {filename} ---", file=sys.stderr)


# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()