initial commit

2025-11-05 12:35:09 -05:00
commit 9c4ee28270
15 changed files with 4347 additions and 0 deletions

main.py (Normal file, 724 lines)

@@ -0,0 +1,724 @@
import os
import re
import time
import json
import xml.etree.ElementTree as ET
import requests
from html import escape as html_escape
from transformers import pipeline
from datetime import datetime, timedelta
from generate_tiktok_feed import save_tiktok_feed
# ======================
# CONFIGURATION
# ======================
def load_config():
    """Load configuration from config.json file."""
    config_file = "config.json"

    # Default configuration (fallback)
    default_config = {
        "interests": {
            "Efficient ML / Edge AI": {
                "query": 'cat:cs.LG OR cat:cs.CV OR cat:cs.CL',
                "keywords": ['efficient', 'edge', 'compression', 'quantization', 'pruning', 'distillation', 'inference', 'lightweight', 'mobile', 'accelerat']
            }
        },
        "settings": {
            "papers_per_interest": 10,
            "summary_max_length": 160,
            "recent_days": 7,
            "fallback_days": 90,
            "min_papers_threshold": 5,
            "fetch_multiplier": 5,
            "user_agent": "ResearchDigestBot/1.0 (github.com/wedsmoker)"
        }
    }

    if os.path.exists(config_file):
        try:
            with open(config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)
            print(f"✅ Loaded configuration from {config_file}")
            return config
        except Exception as e:
            print(f"⚠️ Error loading config file: {e}. Using defaults.")
            return default_config
    else:
        print(f"⚠️ {config_file} not found. Using default configuration.")
        return default_config
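
# Illustrative config.json overriding the defaults above. Key names match
# default_config; the second interest and its values are made-up examples:
#
# {
#   "interests": {
#     "Efficient ML / Edge AI": {
#       "query": "cat:cs.LG OR cat:cs.CV OR cat:cs.CL",
#       "keywords": ["efficient", "edge", "quantization"]
#     },
#     "Speech / Audio": {
#       "query": "cat:eess.AS OR cat:cs.SD",
#       "keywords": ["speech", "audio", "asr"]
#     }
#   },
#   "settings": {"papers_per_interest": 5, "recent_days": 14}
# }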
# Load configuration
config = load_config()
INTERESTS = config.get('interests', {})
settings = config.get('settings', {})
PAPERS_PER_INTEREST = settings.get('papers_per_interest', 10)
SUMMARY_MAX_LENGTH = settings.get('summary_max_length', 160)
USER_AGENT = settings.get('user_agent', 'ResearchDigestBot/1.0')
# Date filtering: Only fetch papers from the last N days (set to 0 to disable)
RECENT_DAYS = settings.get('recent_days', 7)
FALLBACK_DAYS = settings.get('fallback_days', 90)
MIN_PAPERS_THRESHOLD = settings.get('min_papers_threshold', 5)
FETCH_MULTIPLIER = settings.get('fetch_multiplier', 5)
# Deduplication: Track papers we've already shown
SEEN_PAPERS_FILE = "seen_papers.json"
# Initialize summarizer (optional)
try:
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=-1
    )
except Exception as e:
    print(f"⚠️ Summarizer unavailable ({e}). Using raw abstracts.")
    summarizer = None
# ======================
# DEDUPLICATION HELPERS
# ======================
def load_seen_papers():
    """Load the set of previously seen paper IDs."""
    if os.path.exists(SEEN_PAPERS_FILE):
        try:
            with open(SEEN_PAPERS_FILE, 'r') as f:
                data = json.load(f)
            return set(data.get('seen_ids', []))
        except Exception as e:
            print(f"⚠️ Error loading seen papers: {e}")
    return set()
def save_seen_papers(seen_ids):
    """Save the set of seen paper IDs."""
    try:
        with open(SEEN_PAPERS_FILE, 'w') as f:
            json.dump({
                'seen_ids': list(seen_ids),
                'last_updated': datetime.now().isoformat()
            }, f, indent=2)
    except Exception as e:
        print(f"⚠️ Error saving seen papers: {e}")
def get_date_filter(days=None):
    """Generate date filter for arXiv query (last N days)."""
    if days is None:
        days = RECENT_DAYS
    if days <= 0:
        return ""
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days)
    # arXiv date format: YYYYMMDD0000 to YYYYMMDD2359
    date_filter = f"submittedDate:[{start_date.strftime('%Y%m%d')}0000 TO {end_date.strftime('%Y%m%d')}2359]"
    return date_filter
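
# Example: with days=7 and today being 2025-11-05, this returns
#   submittedDate:[202510290000 TO 202511052359]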
# ======================
# ARXIV FETCH & PARSE
# ======================
def fetch_arxiv_papers(query, max_results=5, days_back=None):
    url = "http://export.arxiv.org/api/query"

    # Add date filter if configured
    date_filter = get_date_filter(days_back)
    if date_filter:
        # Combine user query with date filter using AND
        query = f"({query}) AND {date_filter}"

    params = {
        "search_query": query,
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending"
    }
    headers = {"User-Agent": USER_AGENT}
    try:
        response = requests.get(url, params=params, headers=headers, timeout=20)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"❌ Error fetching query '{query}': {e}")
        return None
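
# Illustrative request this produces for a query of "cat:cs.LG" with the
# 7-day filter above (requests handles the actual URL encoding):
#   http://export.arxiv.org/api/query?search_query=(cat:cs.LG)+AND+submittedDate:[202510290000+TO+202511052359]&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending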
def parse_papers(xml_data):
    if not xml_data:
        return []
    try:
        root = ET.fromstring(xml_data)
    except ET.ParseError:
        return []

    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
    papers = []
    for entry in root.findall('atom:entry', namespace):
        title_elem = entry.find('atom:title', namespace)
        summary_elem = entry.find('atom:summary', namespace)
        id_elem = entry.find('atom:id', namespace)
        published_elem = entry.find('atom:published', namespace)
        if title_elem is None or summary_elem is None or id_elem is None:
            continue

        title = ' '.join(title_elem.text.strip().split())
        summary = ' '.join(summary_elem.text.strip().split())
        link = id_elem.text
        published = published_elem.text.split('T')[0] if published_elem is not None else "Unknown"

        # Extract arXiv ID; strip only a trailing version suffix (e.g. "v2")
        # so IDs that happen to contain a 'v' elsewhere are not truncated
        arxiv_id = re.sub(r'v\d+$', '', link.split('/abs/')[-1])

        # Get primary category
        primary_cat_elem = entry.find('.//{http://arxiv.org/schemas/atom}primary_category')
        category = primary_cat_elem.get('term') if primary_cat_elem is not None else "unknown"

        papers.append({
            'title': title,
            'summary': summary,
            'link': link,
            'pdf_link': f"https://arxiv.org/pdf/{arxiv_id}.pdf",
            'arxiv_id': arxiv_id,
            'category': category,
            'published': published
        })
    return papers
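
# Each returned dict is shaped like this (all values illustrative):
# {
#   'title': 'Efficient Quantization for Edge Inference',
#   'summary': 'We propose ...',
#   'link': 'http://arxiv.org/abs/2501.01234v1',
#   'pdf_link': 'https://arxiv.org/pdf/2501.01234.pdf',
#   'arxiv_id': '2501.01234',
#   'category': 'cs.LG',
#   'published': '2025-11-03'
# }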
def summarize_abstract(abstract):
    if summarizer is None:
        return abstract[:SUMMARY_MAX_LENGTH] + ("..." if len(abstract) > SUMMARY_MAX_LENGTH else "")
    try:
        if len(abstract.split()) < 15:
            return abstract
        result = summarizer(
            abstract,
            max_length=min(SUMMARY_MAX_LENGTH, 142),
            min_length=30,
            truncation=True
        )
        return result[0]['summary_text']
    except Exception:
        return abstract[:SUMMARY_MAX_LENGTH] + "..."
def calculate_relevance_score(paper, keywords):
    """Calculate relevance score based on keyword matches in title and abstract."""
    title_lower = paper['title'].lower()
    abstract_lower = paper['summary'].lower()
    score = 0
    matched_keywords = []

    for keyword in keywords:
        keyword_lower = keyword.lower()
        # Title matches are worth more
        if keyword_lower in title_lower:
            score += 3
            matched_keywords.append(keyword)
        # Abstract matches
        elif keyword_lower in abstract_lower:
            score += 1
            matched_keywords.append(keyword)

    # Bonus for multiple keyword matches
    if len(matched_keywords) > 2:
        score += len(matched_keywords) - 2

    paper['relevance_score'] = score
    paper['matched_keywords'] = matched_keywords
    return score
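
# Worked example (made-up paper): title "Quantization for Tiny Models",
# abstract mentioning "pruning" and "edge":
#   'quantization' in title  -> +3
#   'pruning' in abstract    -> +1
#   'edge' in abstract       -> +1
#   3 matches > 2            -> +1 bonus
#   relevance_score = 6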
def estimate_difficulty(abstract, category):
    """Estimate paper difficulty using heuristic keyword analysis."""
    abstract_lower = abstract.lower()

    # Theory-heavy indicators
    complexity_words = ['theoretical', 'proof', 'theorem', 'convergence', 'optimal',
                        'asymptotic', 'lemma', 'proposition', 'rigorous', 'formalism']
    # Applied/practical indicators
    applied_words = ['system', 'framework', 'application', 'dataset', 'benchmark',
                     'implementation', 'experiment', 'empirical', 'practical']
    # Math-heavy categories
    math_categories = ['math.', 'stat.', 'quant-ph']

    # Calculate score
    score = sum(1 for w in complexity_words if w in abstract_lower)
    score -= sum(0.5 for w in applied_words if w in abstract_lower)

    # Category bonus
    if any(cat in category for cat in math_categories):
        score += 1

    # Determine difficulty level
    if score > 2:
        return "🔴 Theory-Heavy"
    elif score > 0.5:
        return "🟡 Advanced"
    else:
        return "🟢 Applied"
def generate_layman_context(title, abstract):
    """Generate simple layman explanation using keyword extraction and templates."""
    abstract_lower = abstract.lower()

    # Extract key action words and concepts
    action_map = {
        'improv': 'improves',
        'reduc': 'reduces',
        'enhanc': 'enhances',
        'optimi': 'optimizes',
        'acceler': 'speeds up',
        'efficient': 'makes more efficient',
        'novel': 'introduces a new approach to',
        'outperform': 'works better than existing methods for',
        'achiev': 'achieves better',
        'propose': 'proposes a method for',
        'present': 'presents techniques for',
        'address': 'tackles the problem of',
        'privacy': 'protecting data privacy in',
        'federated': 'distributed machine learning across',
        'emotion': 'understanding emotions in',
        'embedded': 'running AI on low-power devices for',
        'edge': 'running AI locally on devices for',
        'compression': 'making models smaller for',
        'inference': 'faster predictions in',
        'generative': 'creating new content with',
        'detection': 'automatically finding',
        'classification': 'categorizing',
        'prediction': 'forecasting'
    }

    # Find first matching action
    action = "explores techniques in"
    for keyword, phrase in action_map.items():
        if keyword in abstract_lower[:300]:  # Check first part of abstract
            action = phrase
            break

    # Extract domain
    domain = "machine learning"
    if "language model" in abstract_lower or "llm" in abstract_lower or "nlp" in abstract_lower:
        domain = "language AI"
    elif "vision" in abstract_lower or "image" in abstract_lower or "visual" in abstract_lower:
        domain = "computer vision"
    elif "speech" in abstract_lower or "audio" in abstract_lower:
        domain = "speech processing"
    elif "privacy" in abstract_lower or "federated" in abstract_lower:
        domain = "privacy-preserving AI"
    elif "edge" in abstract_lower or "embedded" in abstract_lower or "device" in abstract_lower:
        domain = "edge computing"
    elif "emotion" in abstract_lower or "affective" in abstract_lower:
        domain = "emotion AI"

    return f"This research {action} {domain}."
# ======================
# HTML OUTPUT
# ======================
def save_html_digest(all_papers_by_interest, filename=None):
    # Create archive directory if it doesn't exist
    archive_dir = "arxiv_archive"
    if not os.path.exists(archive_dir):
        os.makedirs(archive_dir)

    if filename is None:
        date_str = datetime.now().strftime('%Y%m%d')
        filename = os.path.join(archive_dir, f"arxiv_digest_{date_str}.html")

    # Also save as latest.html for easy syncing
    latest_file = "latest.html"
html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>arXiv Digest • {datetime.now().strftime('%Y-%m-%d')}</title>
<style>
* {{ box-sizing: border-box; }}
:root {{
--bg: #0f0f0f;
--text: #e8e8e8;
--muted: #999;
--border: #2a2a2a;
--card-bg: #1a1a1a;
--link: #6ba3ff;
--accent: #ff6b6b;
--green: #51cf66;
--yellow: #ffd43b;
--red: #ff6b6b;
--layman-bg: #1f2937;
--layman-border: #60a5fa;
}}
body {{
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.5;
color: var(--text);
background: var(--bg);
margin: 0;
padding: 1rem;
}}
.container {{
max-width: 1600px;
margin: 0 auto;
}}
header {{
text-align: center;
padding: 2rem 1rem 3rem;
border-bottom: 2px solid var(--border);
margin-bottom: 2rem;
}}
h1 {{
font-weight: 900;
font-size: 2.5rem;
margin: 0;
background: linear-gradient(135deg, var(--accent), #ffa94d);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}}
.meta {{
color: var(--muted);
font-size: 0.95rem;
margin-top: 0.5rem;
letter-spacing: 0.5px;
}}
.interest-section {{
margin-bottom: 3rem;
}}
.interest-header {{
display: flex;
align-items: center;
gap: 0.8rem;
margin-bottom: 1.2rem;
padding: 0.8rem 1rem;
background: var(--card-bg);
border-radius: 12px;
border-left: 4px solid var(--accent);
}}
.interest-title {{
font-size: 1.3rem;
margin: 0;
font-weight: 700;
color: var(--text);
}}
.papers-grid {{
display: grid;
grid-template-columns: repeat(auto-fill, minmax(380px, 1fr));
gap: 1.2rem;
}}
.paper {{
background: var(--card-bg);
border: 1px solid var(--border);
border-radius: 12px;
padding: 1.2rem;
transition: all 0.2s ease;
position: relative;
display: flex;
flex-direction: column;
height: 100%;
}}
.paper:hover {{
border-color: var(--accent);
transform: translateY(-2px);
box-shadow: 0 8px 24px rgba(255, 107, 107, 0.15);
}}
.paper-header {{
display: flex;
justify-content: space-between;
align-items: flex-start;
gap: 0.8rem;
margin-bottom: 0.8rem;
}}
.difficulty-badge {{
padding: 0.3rem 0.7rem;
border-radius: 20px;
font-size: 0.7rem;
font-weight: 700;
white-space: nowrap;
flex-shrink: 0;
}}
.paper h3 {{
font-size: 1.05rem;
margin: 0 0 0.8rem 0;
font-weight: 700;
line-height: 1.4;
color: var(--text);
}}
.layman-box {{
background: var(--layman-bg);
border-left: 3px solid var(--layman-border);
padding: 0.7rem 0.9rem;
margin-bottom: 0.8rem;
border-radius: 6px;
font-size: 0.88rem;
line-height: 1.5;
color: #94a3b8;
font-style: italic;
}}
.summary {{
color: var(--muted);
margin-bottom: 1rem;
font-size: 0.88rem;
line-height: 1.6;
flex-grow: 1;
}}
.paper-footer {{
display: flex;
justify-content: space-between;
align-items: center;
padding-top: 0.8rem;
border-top: 1px solid var(--border);
margin-top: auto;
}}
.category-tag {{
background: #1e3a5f;
color: #60a5fa;
padding: 0.25rem 0.65rem;
border-radius: 15px;
font-size: 0.75rem;
font-weight: 600;
}}
.date {{
color: var(--muted);
font-size: 0.75rem;
}}
.links {{
display: flex;
gap: 1rem;
margin-top: 0.8rem;
}}
.links a {{
color: var(--link);
text-decoration: none;
font-size: 0.85rem;
font-weight: 600;
transition: color 0.2s;
}}
.links a:hover {{
color: var(--accent);
}}
.footer {{
text-align: center;
margin-top: 4rem;
padding: 2rem;
color: var(--muted);
font-size: 0.85rem;
border-top: 1px solid var(--border);
}}
@media (max-width: 768px) {{
.papers-grid {{
grid-template-columns: 1fr;
}}
h1 {{
font-size: 2rem;
}}
}}
</style>
</head>
<body>
<div class="container">
<header>
<h1>arXiv Research Digest</h1>
<div class="meta">{datetime.now().strftime('%B %d, %Y')}{sum(len(p) for p in all_papers_by_interest.values())} papers across {len(all_papers_by_interest)} interests</div>
</header>
"""
    for interest_name, papers in all_papers_by_interest.items():
        html += f"""<section class="interest-section">
    <div class="interest-header">
        <span>🔬</span>
        <h2 class="interest-title">{html_escape(interest_name)}</h2>
    </div>
"""
        if not papers:
            html += '    <p>No recent papers found.</p>\n'
        else:
            html += '    <div class="papers-grid">\n'
            for paper in papers:
                # Escape arXiv-supplied text so stray '<' or '&' can't break the markup
                html += f"""    <article class="paper">
        <div class="paper-header">
            <span class="difficulty-badge">{paper['difficulty']}</span>
        </div>
        <h3>{html_escape(paper['title'])}</h3>
        <div class="layman-box">💡 {paper['layman']}</div>
        <div class="summary">{html_escape(paper['summary'])}</div>
        <div class="paper-footer">
            <span class="category-tag">{paper['category']}</span>
            <span class="date">{paper['published']}</span>
        </div>
        <div class="links">
            <a href="{paper['link']}" target="_blank">Abstract ↗</a>
            <a href="{paper['pdf_link']}" target="_blank">PDF ↗</a>
        </div>
    </article>
"""
            html += '    </div>\n'
        html += "</section>\n"

    html += """    <div class="footer">
        ✨ Generated automatically • Powered by arXiv API
    </div>
</div>
</body>
</html>
"""

    # Save archived version
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f"✨ HTML digest saved to {filename}")

    # Also save as latest.html for quick access
    with open(latest_file, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f"📄 Latest digest saved to {latest_file}")
# ======================
# MAIN EXECUTION
# ======================
if __name__ == "__main__":
    # Load previously seen papers
    seen_papers = load_seen_papers()
    print(f"📋 Loaded {len(seen_papers)} previously seen papers")

    if RECENT_DAYS > 0:
        print(f"📅 Fetching papers from last {RECENT_DAYS} days")
    else:
        print("📅 Fetching all available papers (no date filter)")

    all_papers = {}
    new_papers_count = 0
    duplicate_count = 0

    for interest_name, interest_config in INTERESTS.items():
        query = interest_config['query']
        keywords = interest_config['keywords']
        print(f"\n🔍 Fetching papers for: {interest_name}")

        xml_data = fetch_arxiv_papers(query, PAPERS_PER_INTEREST * FETCH_MULTIPLIER)  # Fetch more to filter
        papers = parse_papers(xml_data) if xml_data else []
        print(f"   → Found {len(papers)} papers")

        # Filter out duplicates and calculate relevance
        fresh_papers = []
        for p in papers:
            if p['arxiv_id'] not in seen_papers:
                # Store original abstract for analysis
                original_abstract = p['summary']
                # Calculate relevance score FIRST (before summarization)
                calculate_relevance_score(p, keywords)
                # Estimate difficulty level (use ORIGINAL abstract before summarization)
                p['difficulty'] = estimate_difficulty(original_abstract, p['category'])
                # Generate layman context (use ORIGINAL abstract for better keyword extraction)
                p['layman'] = generate_layman_context(p['title'], original_abstract)
                # Generate summary (do this last to avoid losing original abstract)
                p['summary'] = summarize_abstract(original_abstract)
                fresh_papers.append(p)
            else:
                duplicate_count += 1

        # Sort by relevance score (highest first)
        fresh_papers.sort(key=lambda x: x['relevance_score'], reverse=True)

        # Take top N papers
        top_papers = fresh_papers[:PAPERS_PER_INTEREST]

        # Mark these papers as seen
        for p in top_papers:
            seen_papers.add(p['arxiv_id'])
            new_papers_count += 1

        all_papers[interest_name] = top_papers
        print(f"{len(top_papers)} new papers (from {len(fresh_papers)} candidates, skipped {len(papers) - len(fresh_papers)} duplicates)")
        if top_papers:
            print(f"   📊 Relevance scores: {[p['relevance_score'] for p in top_papers]}")

        # FALLBACK: If we didn't get enough papers, try a wider date range (only 1 extra request)
        if len(top_papers) < MIN_PAPERS_THRESHOLD and FALLBACK_DAYS > RECENT_DAYS:
            print(f"   🔄 Low yield, trying fallback search (last {FALLBACK_DAYS} days)...")
            time.sleep(3)  # Respect rate limit before fallback request
            xml_data_fallback = fetch_arxiv_papers(query, PAPERS_PER_INTEREST * FETCH_MULTIPLIER, days_back=FALLBACK_DAYS)
            papers_fallback = parse_papers(xml_data_fallback) if xml_data_fallback else []
            print(f"   → Found {len(papers_fallback)} papers in fallback")

            # Process fallback papers
            fallback_fresh = []
            for p in papers_fallback:
                if p['arxiv_id'] not in seen_papers:
                    original_abstract = p['summary']
                    calculate_relevance_score(p, keywords)
                    p['difficulty'] = estimate_difficulty(original_abstract, p['category'])
                    p['layman'] = generate_layman_context(p['title'], original_abstract)
                    p['summary'] = summarize_abstract(original_abstract)
                    fallback_fresh.append(p)

            # Sort fallback papers by relevance
            fallback_fresh.sort(key=lambda x: x['relevance_score'], reverse=True)

            # Add top fallback papers to fill quota
            needed = PAPERS_PER_INTEREST - len(top_papers)
            additional_papers = fallback_fresh[:needed]
            for p in additional_papers:
                seen_papers.add(p['arxiv_id'])
                new_papers_count += 1

            top_papers.extend(additional_papers)
            all_papers[interest_name] = top_papers
            print(f"   ✨ After fallback: {len(top_papers)} total papers")

        # Be kind: 3-second delay between queries (arXiv recommendation)
        time.sleep(3)

    # Save updated seen papers
    save_seen_papers(seen_papers)

    print("\n📊 Summary:")
    print(f"   • Total new papers: {new_papers_count}")
    print(f"   • Total duplicates skipped: {duplicate_count}")
    print(f"   • Total tracked papers: {len(seen_papers)}")

    save_html_digest(all_papers)
    save_tiktok_feed(all_papers)
    print("\n✅ Done! Open the HTML files in your browser.")