commit 9c4ee28270b76a7624bf2aa4fb4a75e18db25637
Author: wwelsh
Date:   Wed Nov 5 12:35:09 2025 -0500

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4136964
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,40 @@
+# Virtual Environment
+venv/
+env/
+ENV/
+.venv/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+
+# ArXiv Downloads
+arxiv_archive/
+*.pdf
+
+# Generated Files
+seen_papers.json
+latest.html
+index.html
+
+# Syncthing
+.stfolder/
+.stignore
+
+# IDE / Editor
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+desktop.ini
+
+# Logs
+*.log
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b77bf2a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1d66199
--- /dev/null
+++ b/README.md
@@ -0,0 +1,203 @@
+![Python](https://img.shields.io/badge/python-3.8+-blue.svg)
+![License](https://img.shields.io/badge/license-MIT-green.svg)
+![arXiv](https://img.shields.io/badge/arXiv-API-red.svg)
+![Platform](https://img.shields.io/badge/platform-windows%20%7C%20linux%20%7C%20macos-lightgrey.svg)
+
+# 📚 Research Digest
+
+**Automated daily research paper digest from arXiv with smart filtering, a mobile-friendly interface, and AI-powered summaries.**
+
+Fetch, filter, and browse the latest research papers tailored to your interests. Desktop grid view for deep reading, mobile feed for quick scrolling.
+
+---
+
+## ✨ Features
+
+- **🎯 Smart Filtering** - Keyword-based relevance scoring across custom research interests
+- **📱 Mobile Feed** - Swipeable, full-screen card interface optimized for phones
+- **🖥️ Desktop Grid** - Multi-column layout with rich metadata and difficulty badges
+- **🧠 AI Summaries** - Auto-generated layman explanations using transformers
+- **🔄 Deduplication** - Never see the same paper twice, with built-in tracking
+- **⚙️ Configurable** - JSON-based settings for interests, filters, and preferences
+- **📦 Archive** - Auto-saves daily digests with a browsable index
+
+---
+
+## 🖼️ Screenshots
+
+### Desktop View
+![Desktop Demo](desktop_demo.png)
+
+### Mobile Feed
+![Mobile Demo](mobile_demo.png)
+
+---
+
+## 🚀 Quick Start
+
+### Windows
+
+1. **Clone & Run**
+   ```bash
+   git clone https://github.com/yourusername/research-digest.git
+   cd research-digest
+   run_digest.bat
+   ```
+
+2. **The first run automatically:**
+   - Creates the virtual environment
+   - Installs dependencies
+   - Fetches papers
+   - Generates the HTML digests
+
+3. **Open in browser:**
+   - `latest.html` - Most recent digest
+   - `index.html` - Browse all archives
+   - `tiktok_feed.html` - Mobile-optimized feed
+
+### Linux/macOS
+
+```bash
+git clone https://github.com/yourusername/research-digest.git
+cd research-digest
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+pip install -r requirements.txt
+python main.py
+python generate_index.py
+```
+
+---
+
+## ⚙️ Configuration
+
+Edit `config.json` to customize:
+
+```json
+{
+  "interests": {
+    "Your Research Area": {
+      "query": "cat:cs.LG OR cat:cs.AI",
+      "keywords": ["keyword1", "keyword2", "keyword3"]
+    }
+  },
+  "settings": {
+    "papers_per_interest": 10,
+    "recent_days": 7,
+    "summary_max_length": 160
+  }
+}
+```
+
+### Available Settings
+
+| Setting | Default | Description |
+|---------|---------|-------------|
+| `papers_per_interest` | 10 | Papers to fetch per category |
+| `recent_days` | 7 | Look-back window in days (0 = all time) |
+| `fallback_days` | 90 | Extended search window when few results are found |
+| `summary_max_length` | 160 | Max characters for summaries |
+| `fetch_multiplier` | 5 | Over-fetch factor for better filtering |
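+
+For a quick sanity check after editing, you can read the file the same way `main.py` does (missing keys fall back to the defaults above). A minimal sketch — the filename `check_config.py` is just a suggestion, not a file in the repo:
+
+```python
+# check_config.py - print the settings and interests the digest will use
+import json
+
+with open("config.json", encoding="utf-8") as f:
+    config = json.load(f)
+
+settings = config.get("settings", {})
+print("papers_per_interest:", settings.get("papers_per_interest", 10))
+print("recent_days:", settings.get("recent_days", 7))
+
+for name, interest in config.get("interests", {}).items():
+    print(f"{name}: query={interest['query']!r}, {len(interest['keywords'])} keywords")
+```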
+
+---
+
+## 📖 arXiv Query Syntax
+
+Use arXiv category codes in queries:
+
+- `cat:cs.LG` - Machine Learning
+- `cat:cs.CV` - Computer Vision
+- `cat:cs.CL` - Computation & Language (NLP)
+- `cat:cs.AI` - Artificial Intelligence
+- `cat:cs.CR` - Cryptography & Security
+- `cat:cs.DC` - Distributed Computing
+
+Combine categories with `OR`/`AND`: `cat:cs.LG OR cat:cs.AI`
+
+[Full category list](https://arxiv.org/category_taxonomy)
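+
+To try a query by hand before committing it to `config.json`, you can call the arXiv API directly. A minimal sketch of the kind of request `main.py` sends (the date range shown is only an example; `main.py` builds it from `recent_days`):
+
+```python
+import requests
+
+# Same endpoint and parameters used by fetch_arxiv_papers() in main.py.
+params = {
+    "search_query": "(cat:cs.LG OR cat:cs.AI) AND submittedDate:[202510290000 TO 202511052359]",
+    "start": 0,
+    "max_results": 5,
+    "sortBy": "submittedDate",
+    "sortOrder": "descending",
+}
+resp = requests.get("http://export.arxiv.org/api/query", params=params, timeout=20)
+resp.raise_for_status()
+print(resp.text[:500])  # Atom XML; each paper is an <entry> element
+```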
+
+---
+
+## 🔧 Advanced Usage
+
+### Automated Daily Digests
+
+**Windows Task Scheduler:**
+1. Open Task Scheduler
+2. Create Basic Task → Daily → 7:00 AM
+3. Action: Start a program → `C:\path\to\run_digest.bat`
+
+**Linux/macOS Cron:**
+```bash
+0 7 * * * cd /path/to/research-digest && ./venv/bin/python main.py && ./venv/bin/python generate_index.py
+```
+
+### Sync to Mobile (Syncthing)
+
+1. Install [Syncthing](https://syncthing.net/) on PC and phone
+2. Share the project folder
+3. Access the HTML files directly on your phone
+
+### Reset Seen Papers
+
+```bash
+python reset_seen_papers.py
+```
+
+---
+
+## 📂 Project Structure
+
+```
+research-digest/
+├── config.json              # Configuration (edit this!)
+├── main.py                  # Core paper fetcher
+├── generate_index.py        # Archive browser generator
+├── generate_tiktok_feed.py  # Mobile feed generator
+├── run_digest.bat           # Windows launcher
+├── requirements.txt         # Python dependencies
+├── latest.html              # Latest digest (auto-generated)
+├── index.html               # Archive browser (auto-generated)
+├── tiktok_feed.html         # Mobile feed (auto-generated)
+├── seen_papers.json         # Deduplication tracker
+└── arxiv_archive/           # Daily archives
+    ├── arxiv_digest_20251101.html
+    └── ...
+```
+
+---
+
+## 🛠️ Requirements
+
+- **Python 3.8+**
+- **Dependencies:** `transformers`, `torch`, `requests` (pinned in `requirements.txt`)
+- **Disk Space:** ~2GB for the summarization model, ~10MB per digest
+- **Internet:** Required for the arXiv API and the first-time model download
+
+---
+
+## 📝 License
+
+MIT License - see the [LICENSE](LICENSE) file for details
+
+---
+
+## 🤝 Contributing
+
+Contributions welcome! Ideas:
+- Additional paper sources (bioRxiv, SSRN, etc.)
+- Browser extension for direct syncing
+- Custom ML models for better summaries
+- Export to Notion/Obsidian/Roam
+
+---
+
+## 🙏 Acknowledgments
+
+- [arXiv](https://arxiv.org/) for the open research repository
+- [Hugging Face](https://huggingface.co/) for transformer models
+- Inspired by modern feed UIs and research workflows
+
+---
+
+**Built with ❤️ for researchers who want to stay current without drowning in papers**
diff --git a/SETUP_GUIDE.md b/SETUP_GUIDE.md
new file mode 100644
index 0000000..1aa9c8b
--- /dev/null
+++ b/SETUP_GUIDE.md
@@ -0,0 +1,198 @@
+# 📱 Syncthing + Daily arXiv Digest Setup Guide
+
+## 🎯 What This Does
+- Automatically runs your arXiv digest **every morning at 7 AM**
+- Archives each day's report in `arxiv_archive/`
+- Creates `latest.html` for quick access
+- Generates `index.html` to browse all past reports
+- Syncs everything to your phone via Syncthing
+
+---
+
+## ⚙️ Step 1: Set Up Windows Task Scheduler
+
+### Option A: Quick Setup
+1. Press `Win + R`, type `taskschd.msc`, press Enter
+2. Click **"Create Basic Task"** in the right panel
+3. Fill in:
+   - **Name:** `arXiv Daily Digest`
+   - **Description:** `Fetches daily research papers and syncs to phone`
+4. **Trigger:** Select "Daily"
+   - Start date: Today
+   - Start time: **7:00 AM**
+   - Recur every: **1 day**
+5. **Action:** Select "Start a program"
+   - **Program/script:** `C:\Users\Admin\python\1aResearch\run_digest.bat`
+   - **Start in:** `C:\Users\Admin\python\1aResearch`
+6. Check **"Open the Properties dialog"** at the end
+7. In Properties:
+   - Go to the **Conditions** tab
+   - ✅ Check "Start only if the following network connection is available" → Select "Any connection"
+   - ❌ Uncheck "Start the task only if the computer is on AC power"
+8. Click **OK**
+
+### Option B: Advanced Settings
+If you want to run it at startup instead:
+- Change the Trigger to **"At log on"**
+- Add a 2-minute delay: In Properties → Triggers → Edit → Delay task for: **2 minutes**
+
+---
+
+## 📂 Step 2: Set Up Syncthing
+
+### On Your PC:
+1. Open the Syncthing web UI (usually `http://localhost:8384`)
+2. Click **"Add Folder"**
+   - **Folder Path:** `C:\Users\Admin\python\1aResearch`
+   - **Folder Label:** `arXiv Research`
+   - **Folder ID:** `arxiv-research` (auto-generated)
+3. Go to the **"Sharing"** tab
+4. Click **"Add Device"** and enter your phone's Device ID
+
+### On Your Phone:
+1. Install **Syncthing** from the Play Store / App Store
+2. Open the app → **Add Device** → Scan the QR code from the PC
+3. Accept the folder share request (`arXiv Research`)
+4. Set the sync folder location (e.g., `/storage/emulated/0/arXiv/`)
+
+### What Gets Synced:
+```
+1aResearch/
+├── latest.html          ← Most recent digest (quick access)
+├── index.html           ← Browse all reports
+└── arxiv_archive/
+    ├── arxiv_digest_20251101.html
+    ├── arxiv_digest_20251102.html
+    └── ... (daily backups)
+```
+
+---
+
+## 📱 Step 3: View on Your Phone
+
+### Method 1: Direct File Access
+1. Open your phone's file manager
+2. Navigate to the Syncthing folder (e.g., `arXiv/`)
+3. Open `latest.html` with any browser
+4. Open `index.html` to browse past reports
+
+### Method 2: Use a Local HTML Viewer App
+Install **"HTML Viewer"** or **"WebView Tester"** from the app store:
+- Point it to your Syncthing folder
+- Bookmark `latest.html` for instant access
+
+### Method 3: Create a Home Screen Shortcut (Android)
+1. Open `latest.html` in Chrome
+2. Menu → **"Add to Home screen"**
+3. Name it "arXiv Digest"
+4. Now you have one-tap access!
+
+---
+
+## 🧪 Testing Your Setup
+
+### Test the Batch Script:
+```batch
+REM Double-click run_digest.bat, or run it from Command Prompt:
+cd C:\Users\Admin\python\1aResearch
+run_digest.bat
+```
+
+Expected output:
+```
+Running arXiv digest...
+🔍 Fetching papers for: Efficient ML / Edge AI
+   → Found 5 papers
+...
+✨ HTML digest saved to arxiv_archive\arxiv_digest_20251101.html
+📄 Latest digest saved to latest.html
+
+Generating index page...
+📑 Index page generated with 1 reports
+Done! All files updated.
+```
+
+### Test Syncthing Sync:
+1. Create/edit any file in `C:\Users\Admin\python\1aResearch`
+2. Check your phone's Syncthing folder
+3. The file should appear within seconds
+
+### Test Task Scheduler:
+1. Open Task Scheduler
+2. Find "arXiv Daily Digest"
+3. Right-click → **"Run"**
+4. Watch it execute
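+
+### Test Deduplication:
+After at least one run, `seen_papers.json` tracks every paper that has already been shown. A minimal sketch for inspecting it (run from the project folder; this snippet is illustrative, not a file in the repo):
+
+```python
+import json
+
+# seen_papers.json is written by save_seen_papers() in main.py
+with open("seen_papers.json", encoding="utf-8") as f:
+    data = json.load(f)
+
+print(f"{len(data['seen_ids'])} papers tracked")
+print("Last updated:", data.get("last_updated", "unknown"))
+```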
+
+---
+
+## 🎨 Customization Ideas
+
+### Change Run Time:
+Edit the Task Scheduler trigger to your preferred time (e.g., 6 AM, 9 PM)
+
+### Change Number of Papers:
+Edit `papers_per_interest` in `config.json` (`main.py` reads all of its settings from there):
+```json
+"settings": {
+    "papers_per_interest": 10
+}
+```
+
+### Add More Interest Areas:
+Add an entry under `"interests"` in `config.json`, with an arXiv query and the keywords used for relevance scoring:
+```json
+"Your Topic": {
+    "query": "abs:\"your keywords\" OR ti:\"your topic\"",
+    "keywords": ["keyword1", "keyword2"]
+}
+```
+
+### Sync Only HTML Files (Save Space):
+In Syncthing → Folder → **Ignore Patterns**, add:
+```
+!/arxiv_archive/*.html
+!/latest.html
+!/index.html
+*
+```
+
+---
+
+## 🔧 Troubleshooting
+
+### Task Scheduler doesn't run:
+- Check Windows Event Viewer: `Win + X` → Event Viewer → Task Scheduler logs
+- Ensure "Run whether user is logged on or not" is selected
+- Make sure a network connection is available
+
+### Syncthing not syncing:
+- Check that both devices are connected to the same network (or the internet)
+- Verify the Device IDs match
+- Check the folder status in the Syncthing UI (it should say "Up to Date")
+
+### Python script fails:
+- Test manually: `cd C:\Users\Admin\python\1aResearch && venv\Scripts\activate && python main.py`
+- Check arXiv API rate limits (3-second delays are built in)
+- Ensure the internet connection is active
+
+### Old reports taking up space:
+Create a cleanup script to delete reports older than 30 days:
+```python
+# cleanup_old.py - remove archived digests older than 30 days
+import os, glob, time
+for f in glob.glob("arxiv_archive/*.html"):
+    if os.path.getmtime(f) < time.time() - 30*86400:
+        os.remove(f)
+```
+
+---
+
+## 🎉 You're All Set!
+
+Every morning at 7 AM:
+1. ✅ The script fetches the latest papers
+2. ✅ Generates a beautiful HTML report
+3. ✅ Archives it with the date
+4. ✅ Updates the index page
+5. ✅ Syncs to your phone
+6. ✅ Read cutting-edge research over coffee!
+
+**Enjoy your automated research digest! 🚀**
diff --git a/arxiv_digest_20251101.html b/arxiv_digest_20251101.html
new file mode 100644
index 0000000..eca5b71
--- /dev/null
+++ b/arxiv_digest_20251101.html
@@ -0,0 +1,985 @@
+
+
+
+
+
+    arXiv Digest • 2025-11-01
+
+
+
+
+

arXiv Research Digest

+
November 01, 2025 โ€ข 45 papers across 5 interests
+
+
+
+ ๐Ÿ”ฌ +

Efficient ML / Edge AI

+
+
+
+
+ ๐ŸŸข Applied +
+

Inference-Cost-Aware Dynamic Tree Construction for Efficient Inference in Large Language Models

+
๐Ÿ’ก This research tackles the problem of language AI.
+
Large Language Models (LLMs) face significant inference latency challenges stemming from their autoregressive design and large size . To address this, speculative decoding emerges as a solution, enabling the simultaneous generation and validation of multiple tokens .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Distilling Multilingual Vision-Language Models: When Smaller Models Stay Multilingual

+
💡 This research reduces the size of language AI models.
+
Knowledge distillation (KD) demonstrates promising results in transferring knowledge from larger to smaller VLMs. Applying KD in multilingual settings is an underexplored area. We study five distillation formulations across CLIP and SigLIP2.
+ + +
+
+
+ ๐ŸŸข Applied +
+

STAR: A Privacy-Preserving, Energy-Efficient Edge AI Framework for Human Activity Recognition via Wi-Fi CSI in Mobile and Pervasive Computing Environments

+
๐Ÿ’ก This research presents techniques for privacy-preserving AI.
+
Human Activity Recognition (HAR) via Wi-Fi Channel State Information (CSI) presents a privacy-preserving, contactless sensing approach suitable for smart homes, healthcare monitoring, and mobile IoT systems .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Do Students Debias Like Teachers? On the Distillability of Bias Mitigation Methods

+
💡 This research runs AI locally on devices for computer vision.
+
Knowledge distillation (KD) is an effective method for model compression and transferring knowledge between models. However, its effect on a model's robustness against spurious correlations that degrade performance on out-of-distribution data remains underexplored. This study investigates the effect of knowledge distillation on the transferability of "debiasing" capabilities from teacher models to student models.
+ + +
+
+
+ ๐ŸŸข Applied +
+

An Agentic Framework for Rapid Deployment of Edge AI Solutions in Industry 5.0

+
💡 This research reduces latency in edge computing.
+
We present a novel framework for Industry 5.0 that simplifies the deployment of AI models on edge devices in various industrial settings . The design reduces latency and avoids external data transfer by enabling local inference and real-time processing .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Energy-Efficient Autonomous Driving with Adaptive Perception and Robust Decision

+
๐Ÿ’ก This research explores techniques in machine learning.
+
Autonomous driving is an emerging technology that is expected to bring significant social, economic, and environmental benefits . However, these benefits come with rising energy consumption by computation engines limiting the driving range of vehicles, especially electric ones . Perception computing is typically the most power-intensive component, as it relies on deep learning models to extract environmental features . To address these challenges, we propose an energy-efficient autonomous driving framework, called EneAD .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Resource-Efficient and Robust Inference of Deep and Bayesian Neural Networks on Embedded and Analog Computing Platforms

+
💡 This research makes edge computing more efficient.
+
While machine learning has transformed numerous application domains, its growing computational demands increasingly constrain scalability and efficiency . In practice, neural networks must not only operate efficiently but also provide reliable predictions under distributional shifts or unseen data . This work advances resource-efficient and robust inference for both conventional and Bayesian neural networks .
+ + +
+
+
+ ๐ŸŸข Applied +
+

UHKD: A Unified Framework for Heterogeneous Knowledge Distillation via Frequency-Domain Representations

+
💡 This research reduces model cost in computer vision.
+
Knowledge distillation (KD) is an effective model compression technique that transfers knowledge from a high-performance teacher to a lightweight student, reducing cost while maintaining accuracy . In visual applications, where large-scale image models are widely used, KD enables efficient deployment .
+ + +
+
+
+ ๐ŸŸข Applied +
+

A Survey on Efficient Vision-Language-Action Models

+
๐Ÿ’ก This research presents techniques for computer vision.
+
Vision-Language-Action models (VLAs) represent a significant frontier in embodied intelligence, aiming to bridge digital knowledge with physical-world interaction . While these models have demonstrated remarkable generalist capabilities, deployment is severely hampered by the substantial computational and data requirements .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Rethinking Inference Placement for Deep Learning across Edge and Cloud Platforms: A Multi-Objective Optimization Perspective and Future Directions

+
💡 This research runs AI locally on devices for language AI.
+
Edge intelligent applications like VR/AR and language model based chatbots have become widespread with the rapid expansion of IoT and mobile devices . But constrained edge devices often cannot serve the increasingly large and complex deep learning (DL) models . Research aims to balance accuracy, computation delay, transmission delay, and privacy concerns .
+ + +
+
+
+
+
+ ๐Ÿ”ฌ +

Privacy-Preserving ML

+
+
+
+
+ ๐ŸŸข Applied +
+

Non-Convex Over-the-Air Heterogeneous Federated Learning: A Bias-Variance Trade-off

+
💡 This research applies distributed machine learning to computer vision.
+
Over-the-air (OTA) federated learning (FL) has been well recognized as a scalable paradigm that exploits the waveform superposition of the wireless multiple-access channel . We develop novel OTA-FL SGD updates that allow a structured, time-invariant model bias while facilitating reduced variance updates .
+ + +
+
+
+ ๐ŸŸก Advanced +
+

On Purely Private Covariance Estimation

+
๐Ÿ’ก This research presents techniques for privacy-preserving AI.
+
We present a simple perturbation mechanism for the release of $d$-dimensional covariance matrices under pure differential privacy. For large datasets with at least $n \geq d^2/\varepsilon$ elements, our mechanism recovers the provably optimal Frobenius norm error guarantees of \cite{nikolov2023private}.
+ + +
+
+
+ ๐ŸŸข Applied +
+

Tight Differentially Private PCA via Matrix Coherence

+
💡 This research makes privacy-preserving AI more efficient.
+
We revisit the task of computing the span of the top $r$ singular vectors $u_1, \ldots, u_r$ of a matrix under differential privacy . We show that a simple and efficient algorithm -- based on singular value decomposition and standard perturbation mechanisms -- returns a private rank-$r$ approximation whose error depends only on the coherence of the input .
+ + +
+
+
+ ๐ŸŸข Applied +
+

UnifiedFL: A Dynamic Unified Learning Framework for Equitable Federation

+
💡 This research protects data privacy in privacy-preserving AI.
+
Federated learning (FL) has emerged as a key paradigm for collaborative model training across multiple clients without sharing raw data . We propose UnifiedFL, a dynamic federated learning framework that represents heterogeneous local networks as nodes and edges in a directed model graph optimized by a shared graph neural network .
+ + +
+
+
+ ๐ŸŸข Applied +
+

PEEL: A Poisoning-Exposing Encoding Theoretical Framework for Local Differential Privacy

+
💡 This research protects data privacy in privacy-preserving AI.
+
Local Differential Privacy (LDP) is a widely adopted privacy-protection model in the Internet of Things . However, existing defenses either incur prohibitive resource overheads or rely on domain-specific prior knowledge . We propose PEEL, a Poisoning-Exposing Encoding theoretical framework for LDP, which departs from resource- or prior-dependent countermeasures . PEEL amplifies stealthy poisoning effects by re-encoding LDP-perturbed data via sparsification, normalization, and low-rank projection .
+ + +
+
+
+
+
+ ๐Ÿ”ฌ +

Creative AI / Emotion

+
+
+
+
+ ๐ŸŸข Applied +
+

Contribution-Guided Asymmetric Learning for Robust Multimodal Fusion under Imbalance and Noise

+
💡 This research achieves better results in emotion AI.
+
Contribution-Guided Asymmetric Learning (CAL) aims to enhance the contribution of high-contribution modalities while compressing weak modalities to increase their contribution . CAL has shown outstanding performance in imbalanced fusion tasks and noise robustness tests . CAL is based on a modality contribution metric W^m combining the information quantity I(m) and confidence D(m).
+ + +
+
+
+ ๐ŸŸข Applied +
+

Lost in Phonation: Voice Quality Variation as an Evaluation Dimension for Speech Foundation Models

+
๐Ÿ’ก This research presents techniques for speech processing.
+
Recent advances in speech foundation models (SFMs) have enabled the direct processing of spoken language from raw audio . This capability allows SFMs to be exposed to rich paralinguistic variations embedded in the input speech signal . One under-explored dimension of this variation is voice quality, encompassing phonation types such as creaky and breathy voice .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Evaluating Emotion Recognition in Spoken Language Models on Emotionally Incongruent Speech

+
💡 This research achieves better results in language AI.
+
Advances in spoken language processing have driven the development of spoken language models . We evaluate four SLMs on the task of speech emotion recognition using a dataset of emotionally incongruent speech samples . Results indicate that SLMs rely predominantly on textual semantics rather than speech emotion to perform the task .
+ + +
+
+
+ ๐ŸŸข Applied +
+

MCIHN: A Hybrid Network Model Based on Multi-path Cross-modal Interaction for Multimodal Emotion Recognition

+
💡 This research focuses on understanding emotions in emotion AI.
+
Multimodal emotion recognition is crucial for future human-computer interaction. However, accurate emotion recognition still faces significant challenges due to differences between modalities and the difficulty of characterizing unimodal emotional information. A hybrid network model based on multi-path cross-modal interaction (MCIHN) is proposed.
+ + +
+
+
+ ๐ŸŸข Applied +
+

Emotion-Coherent Reasoning for Multimodal LLMs via Emotional Rationale Verifier

+
💡 This research focuses on understanding emotions in language AI.
+
The authors propose the Emotional Rationale Verifier (ERV) and an Explanation Reward as novel approaches to predicting emotions. Their method significantly improves explanation-prediction consistency and explanation emotion accuracy.
+ + +
+
+
+ ๐ŸŸข Applied +
+

Emotion Recognition with Minimal Wearable Sensing: Multi-domain Feature, Hybrid Feature Selection, and Personalized vs. Generalized Ensemble Model Analysis

+
๐Ÿ’ก This research proposes a method for edge computing.
+
Negative emotions are linked to the onset of neurodegenerative diseases and dementia . Physiological signals from wearable devices offer a promising noninvasive method for continuous emotion monitoring . The method is designed for deployment in resource-constrained systems, such as Internet of Things .
+ + +
+
+
+ ๐ŸŸข Applied +
+

LUNA: Efficient and Topology-Agnostic Foundation Model for EEG Signal Analysis

+
๐Ÿ’ก This research explores techniques in emotion AI.
+
LUNA (Latent Unified Network Architecture) is a self-supervised foundation model that reconciles disparate electrode geometries while scaling linearly -- not quadratically -- with channel count . LUNA compresses multi-channel EEG into a fixed-size, topology-agnostic latent space via learned queries and cross-attention . It demonstrates highly competitive performance across several benchmarks .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Multi-dataset Joint Pre-training of Emotional EEG Enables Generalizable Affective Computing

+
๐Ÿ’ก This research presents techniques for emotion AI.
+
The method outperforms state-of-the-art large-scale EEG models by an average of 4.57% in AUROC for few-shot emotion recognition and 11.92% in accuracy for zero-shot generalization to a new dataset .
+ + +
+
+
+ ๐ŸŸข Applied +
+

SentiMaithili: A Benchmark Dataset for Sentiment and Reason Generation for the Low-Resource Maithili Language

+
๐Ÿ’ก This research presents techniques for language AI.
+
Maithili is an Indo-Aryan language spoken by more than 13 million people in the Purvanchal region of India . It is valued for its rich linguistic structure and cultural significance .
+ + +
+
+
+ ๐ŸŸข Applied +
+

REVE: A Foundation Model for EEG -- Adapting to Any Setup with Large-Scale Pretraining on 25,000 Subjects

+
💡 This research reduces reliance on task-specific data via large-scale pretraining.
+
Foundation models have transformed AI by reducing reliance on task-specific data through large-scale pretraining . While successful in language and vision, their adoption in EEG has lagged due to the heterogeneity of public datasets . Existing EEG foundation models struggle to generalize across these variations, often restricting pretraining to a single setup . We present REVE (Representation for EEG with Versatile Embeddings) a pretrained model .
+ + +
+
+
+
+
+ ๐Ÿ”ฌ +

Lightweight Systems

+
+
+
+
+ ๐ŸŸข Applied +
+

Vectorized Context-Aware Embeddings for GAT-Based Collaborative Filtering

+
๐Ÿ’ก This research enhances language AI.
+
Recommender systems often struggle with data sparsity and cold-start scenarios. This paper presents a Graph Attention Network (GAT) based Collaborative Filtering (CF) framework enhanced with context-aware embeddings.
+ + +
+
+
+ ๐ŸŸก Advanced +
+

On neighborhoods of embedded toroidal and Hopf manifolds and their foliations

+
💡 This research runs AI on low-power devices for edge computing.
+
In this article, we give completely new examples of embedded complex manifolds the germ of neighborhood of which is holomorphically equivalent to the zero section in its normal bundle . The first set of examples is composed of connected abelian complex Lie groups, embedded in some complex manifold $M$. The second set is $n$-dimensional Hopf manifolds, embedded as hypersurfaces .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Scales++: Compute Efficient Evaluation Subset Selection with Cognitive Scales Embeddings

+
💡 This research makes language AI more efficient.
+
The prohibitive cost of evaluating large language models (LLMs) on comprehensive benchmarks necessitates the creation of small yet representative data subsets that enable efficient assessment while retaining predictive fidelity . Current methods for this task operate under a model-centric paradigm, selecting benchmarking items based on the collective performance of existing models . Such approaches are limited by large upfront costs, an inability to immediately handle new benchmarks (`cold-start'), and the fragile assumption that future models will share the failure patterns of their predecessors .
+ + +
+
+
+ ๐ŸŸก Advanced +
+

From Embedding to Control: Representations for Stochastic Multi-Object Systems

+
💡 This research achieves better results in machine learning.
+
This paper studies how to achieve accurate modeling and effective control in stochastic nonlinear dynamics with multiple interacting objects . Non-uniform interactions and random topologies make this task challenging .
+ + +
+
+
+ ๐Ÿ”ด Theory-Heavy +
+

Sharp embeddings and existence results for Logarithmic $p$-Laplacian equations with critical growth

+
๐Ÿ’ก This research explores techniques in machine learning.
+
In this paper, we derive a new $p$-Logarithmic Sobolev inequality and optimal continuous and compact embeddings into Orlicz-type spaces of the function space associated with the logarithmic $p$-Laplacian. By employing the method of the Nehari manifold, we prove the existence of a nontrivial weak solution. We conduct an asymptotic analysis of a weighted nonlocal, nonlinear problem governed by the fractional
+ + +
+
+
+ ๐ŸŸข Applied +
+

Accretion rates of stellar-mass compact objects embedded in AGN discs

+
💡 This research runs AI on low-power devices for edge computing.
+
Stellar-mass compact objects (COs) embedded in active galactic nucleus (AGN) discs are commonly assumed to accrete via Bondi or Bondi-Hoyle-Lyttleton prescriptions . We show that differential rotation in AGN discs can impart non-negligible angular momentum, in which case accretion proceeds through a viscous disc rather than Bondi/BHL flow .
+ + +
+
+
+ ๐ŸŸก Advanced +
+

An explicit formula of the limit of the heat kernel measures on the spheres embedded in $\R^\infty$

+
๐Ÿ’ก This research explores techniques in machine learning.
+
We show that the heat kernel measures based at the north pole of the spheres converge to a Gaussian measure in $\R^\infty$. We also find an explicit formula for this measure.
+ + +
+
+
+ ๐ŸŸก Advanced +
+

Tight Spherical Embeddings (Updated Version)

+
๐Ÿ’ก This research explores techniques in machine learning.
+
This is an updated version of a paper which appeared in the proceedings of the 1979 Berlin Colloquium on Global Differential Geometry. The main result of this paper is that every compact isoparametric hypersurface $M^n \subset S^{n+1}$ is tight.
+ + +
+
+
+ ๐ŸŸข Applied +
+

Enhanced quality factors at resonance in acoustofluidic cavities embedded in matched elastic metamaterials

+
๐Ÿ’ก This research enhances machine learning.
+
We show that by embedding liquid-filled acoustofluidic cavities in a metamaterial, the quality factor of the cavity at selected acoustic resonance modes can be enhanced by 2 to 3 orders of magnitude .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Hierarchical Physics-Embedded Learning for Spatiotemporal Dynamical Systems

+
๐Ÿ’ก This research explores techniques in edge computing.
+
Modeling complex spatiotemporal dynamics, particularly in far-from-equilibrium systems, remains a challenge in science . The governing partial differential equations (PDEs) for these systems are often intractable to derive from first principles .
+ + +
+
+
+
+
+ ๐Ÿ”ฌ +

Offline-First / Local AI

+
+
+
+
+ ๐ŸŸข Applied +
+

SBASH: a Framework for Designing and Evaluating RAG vs. Prompt-Tuned LLM Honeypots

+
๐Ÿ’ก This research explores techniques in language AI.
+
Honeypots are decoy systems used for gathering valuable threat intelligence . Maximising attacker engagement is essential to their utility . Research has highlighted that context-awareness is necessary to increase engagement . Large Language Models (LLMs) have been shown as one approach to increase context awareness .
+ + +
+
+
+ ๐ŸŸข Applied +
+

REx86: A Local Large Language Model for Assisting in x86 Assembly Reverse Engineering

+
๐Ÿ’ก This research improves language AI.
+
Large Language Models offer potential for improving RE efficiency through automated comprehension and commenting. Cloud-hosted, closed-weight models pose privacy and security risks and cannot be used in closed-network facilities. REx86 reduces test-set cross-entropy loss by 64.2% and improves semantic cosine similarity against ground truth by 20.3% over its base model.
+ + +
+
+
+ ๐ŸŸข Applied +
+

CORE: Reducing UI Exposure in Mobile Agents via Collaboration Between Cloud and Local LLMs

+
💡 This research achieves better results in language AI.
+
Mobile agents rely on Large Language Models (LLMs) to plan and execute tasks on smartphone user interfaces. While cloud-based LLMs achieve high task accuracy, they require uploading the full UI state at every step. In contrast, local LLMs avoid UI uploads but suffer from limited capacity, resulting in lower task success rates. CORE comprises three key components: (1) layout-aware block partitioning, (2) co-planning, and (3) co-decision-making.
+ + +
+
+
+ ๐ŸŸข Applied +
+

LLM-guided Hierarchical Retrieval

+
๐Ÿ’ก This research explores techniques in language AI.
+
Modern IR systems are increasingly tasked with answering complex, multi-faceted queries that require deep reasoning . We introduce LATTICE, a hierarchical retrieval framework that enables an LLM to reason over and navigate large corpora with logarithmic search complexity .
+ + +
+
+
+ ๐ŸŸข Applied +
+

COSTAR-A: A prompting framework for enhancing Large Language Model performance on Point-of-View questions

+
๐Ÿ’ก This research enhances language AI.
+
COSTAR-A is a novel prompt engineering framework that extends the existing COSTAR method (Context, Objective, Style, Tone, Audience, Response) by adding an 'Answer' component at the end.
+ + +
+
+
+ ๐ŸŸข Applied +
+

Bridging Semantics & Structure for Software Vulnerability Detection using Hybrid Network Models

+
๐Ÿ’ก This research explores techniques in language AI.
+
Software vulnerabilities remain a persistent risk, yet static and dynamic analyses often overlook structural dependencies that shape insecure behaviors . Viewing programs as heterogeneous graphs, we capture control- and data-flow relations as complex interaction networks . Our hybrid framework combines these graph representations with light-weight (<4B) local LLMs .
+ + +
+
+
+ ๐ŸŸข Applied +
+

Open WebUI: An Open, Extensible, and Usable Interface for AI Interaction

+
๐Ÿ’ก This research presents techniques for language AI.
+
The toolkit is designed to be open (open-source and local) and extensible (plugin support; users can interact with multiple models). The extensibility is enabled through a two-pronged plugin architecture and a community platform for sharing, importing, and adapting extensions.
+ + +
+
+
+ ๐ŸŸข Applied +
+

DualTune: Decoupled Fine-Tuning for On-Device Agentic Systems

+
💡 This research protects data privacy in language AI.
+
Large Language Models (LLMs) consistently underperform compared to frontier models in tool calling scenarios. We propose "decoupled fine-tuning" to create dedicated LoRA adapters for tool selection and tool-specific argument generation, using separate loss masking for each of the subtasks. DualTune is an inference framework that leverages the LoRA adapters created using decoupled fine-tuning to perform efficient agent orchestration with the help of local models.
+ + +
+
+
+ ๐ŸŸข Applied +
+

SecureFixAgent: A Hybrid LLM Agent for Automated Python Static Vulnerability Repair

+
💡 This research automatically finds vulnerabilities with language AI.
+
Static analysis tools like Bandit are effective at vulnerability detection but suffer from high false positives and lack repair capabilities . Large Language Models (LLMs) can suggest fixes but often hallucinate changes and lack self-validation . We present SecureFixAgent, a hybrid repair framework integrating Bandit with lightweight local LLMs in an iterative detect-repair-validate loop .
+ + +
+
+
+ ๐ŸŸข Applied +
+

PrivWeb: Unobtrusive and Content-aware Privacy Protection For Web Agents

+
💡 This research protects data privacy in language AI.
+
PrivWeb is a trusted add-on for web agents that anonymizes private information on interfaces according to user preferences. It features privacy categorization and adaptive notifications that selectively pause tasks, giving the user control over the collection of highly sensitive information. PrivWeb reduces perceived privacy risks with no associated increase in cognitive effort, and results in higher overall satisfaction.
+ + +
+
+
+ +
+ + diff --git a/config.json b/config.json new file mode 100644 index 0000000..d193f9b --- /dev/null +++ b/config.json @@ -0,0 +1,84 @@ +{ + "interests": { + "Efficient ML / Edge AI": { + "query": "cat:cs.LG OR cat:cs.CV OR cat:cs.CL", + "keywords": [ + "efficient", + "edge", + "compression", + "quantization", + "pruning", + "distillation", + "inference", + "lightweight", + "mobile", + "accelerat" + ] + }, + "Privacy-Preserving ML": { + "query": "cat:cs.CR OR cat:cs.LG", + "keywords": [ + "privacy", + "federated", + "differential", + "secure", + "encrypted", + "confidential", + "private", + "anonymi" + ] + }, + "Creative AI / Emotion": { + "query": "cat:cs.AI OR cat:cs.SD OR cat:cs.HC", + "keywords": [ + "emotion", + "generative", + "creative", + "music", + "affective", + "sentiment", + "art", + "design", + "audio", + "synthesis" + ] + }, + "Lightweight Systems": { + "query": "cat:cs.DC OR cat:cs.AR", + "keywords": [ + "embedded", + "iot", + "edge", + "resource", + "constrained", + "microcontroller", + "low-power", + "sensor", + "device" + ] + }, + "Offline-First / Local AI": { + "query": "cat:cs.LG", + "keywords": [ + "local", + "device", + "mobile", + "offline", + "on-device", + "edge", + "browser", + "client-side", + "standalone" + ] + } + }, + "settings": { + "papers_per_interest": 10, + "summary_max_length": 160, + "recent_days": 7, + "fallback_days": 90, + "min_papers_threshold": 5, + "fetch_multiplier": 5, + "user_agent": "ResearchDigestBot/1.0 (github.com/wedsmoker)" + } +} diff --git a/desktop_demo.png b/desktop_demo.png new file mode 100644 index 0000000..9303d74 Binary files /dev/null and b/desktop_demo.png differ diff --git a/generate_index.py b/generate_index.py new file mode 100644 index 0000000..865a6c5 --- /dev/null +++ b/generate_index.py @@ -0,0 +1,205 @@ +"""Generate an index.html page to browse all archived digests.""" +import os +from datetime import datetime +import glob + +def generate_index(): + archive_dir = "arxiv_archive" + + # Get all digest files + if os.path.exists(archive_dir): + digest_files = sorted(glob.glob(os.path.join(archive_dir, "arxiv_digest_*.html")), reverse=True) + else: + digest_files = [] + + # Parse dates and create entries + entries = [] + for filepath in digest_files: + filename = os.path.basename(filepath) + # Extract date from filename: arxiv_digest_20251101.html + date_str = filename.replace("arxiv_digest_", "").replace(".html", "") + try: + date_obj = datetime.strptime(date_str, "%Y%m%d") + formatted_date = date_obj.strftime("%B %d, %Y") + day_of_week = date_obj.strftime("%A") + entries.append({ + 'filename': filename, + 'date': formatted_date, + 'day': day_of_week, + 'date_obj': date_obj + }) + except ValueError: + continue + + html = f""" + + + + + arXiv Digest Archive + + + +
+
+

๐Ÿ“š arXiv Digest Archive

+

Browse your daily research digests

+
+ +
+ ๐Ÿ“ฐ View Latest Digest +
+ +

Past Reports

+""" + + if entries: + html += ' \n' + else: + html += '
No archived reports yet. Run the digest script to generate your first report!
\n' + + html += f""" +
+ {len(entries)} report{"s" if len(entries) != 1 else ""} archived โ€ข Updated {datetime.now().strftime("%B %d, %Y at %I:%M %p")} +
+
+ + +""" + + with open("index.html", 'w', encoding='utf-8') as f: + f.write(html) + print(f"๐Ÿ“‘ Index page generated with {len(entries)} reports") + +if __name__ == "__main__": + generate_index() diff --git a/generate_tiktok_feed.py b/generate_tiktok_feed.py new file mode 100644 index 0000000..646e9ed --- /dev/null +++ b/generate_tiktok_feed.py @@ -0,0 +1,512 @@ +import json +import random +from datetime import datetime + +def interleave_papers_by_interest(all_papers_by_interest): + """ + Interleave papers round-robin style across interests. + Returns a flat list cycling through: Interest1[0], Interest2[0], ..., Interest1[1], Interest2[1], ... + """ + # Shuffle papers within each interest category + for interest_name in all_papers_by_interest: + random.shuffle(all_papers_by_interest[interest_name]) + + # Interleave round-robin + interleaved = [] + interest_names = list(all_papers_by_interest.keys()) + max_papers = max(len(papers) for papers in all_papers_by_interest.values()) if all_papers_by_interest else 0 + + for i in range(max_papers): + for interest_name in interest_names: + papers = all_papers_by_interest[interest_name] + if i < len(papers): + # Add interest category to paper data + papers[i]['interest_category'] = interest_name + interleaved.append(papers[i]) + + return interleaved + +def generate_tiktok_html(interleaved_papers): + """Generate self-contained TikTok-style feed HTML with embedded data.""" + + papers_json = json.dumps(interleaved_papers, indent=2, ensure_ascii=False) + date_str = datetime.now().strftime('%B %d, %Y') + + html = f""" + + + + + Research Feed โ€ข {date_str} + + + + +
+ + +
+ +
+ +
+ โ™ก +
+ +
+ โ†“ Scroll to explore +
+ + + + +""" + + return html + +def save_tiktok_feed(all_papers_by_interest, filename='tiktok_feed.html'): + """ + Generate and save TikTok-style feed from papers data. + Called by main.py after fetching papers. + """ + # Interleave papers round-robin + interleaved = interleave_papers_by_interest(all_papers_by_interest) + print(f"\n๐Ÿ”„ Interleaved {len(interleaved)} papers across {len(all_papers_by_interest)} interests") + + # Generate HTML + html = generate_tiktok_html(interleaved) + + # Save file + with open(filename, 'w', encoding='utf-8') as f: + f.write(html) + + print(f"โœจ TikTok feed saved to {filename}") + print("๐Ÿ“ฑ Sync with your phone and open in browser!") diff --git a/main.py b/main.py new file mode 100644 index 0000000..ac6348d --- /dev/null +++ b/main.py @@ -0,0 +1,724 @@ +import os +import time +import json +import xml.etree.ElementTree as ET +import requests +from transformers import pipeline +from datetime import datetime, timedelta +from generate_tiktok_feed import save_tiktok_feed + +# ====================== +# CONFIGURATION +# ====================== + +def load_config(): + """Load configuration from config.json file.""" + config_file = "config.json" + + # Default configuration (fallback) + default_config = { + "interests": { + "Efficient ML / Edge AI": { + "query": 'cat:cs.LG OR cat:cs.CV OR cat:cs.CL', + "keywords": ['efficient', 'edge', 'compression', 'quantization', 'pruning', 'distillation', 'inference', 'lightweight', 'mobile', 'accelerat'] + } + }, + "settings": { + "papers_per_interest": 10, + "summary_max_length": 160, + "recent_days": 7, + "fallback_days": 90, + "min_papers_threshold": 5, + "fetch_multiplier": 5, + "user_agent": "ResearchDigestBot/1.0 (github.com/wedsmoker)" + } + } + + if os.path.exists(config_file): + try: + with open(config_file, 'r', encoding='utf-8') as f: + config = json.load(f) + print(f"โœ… Loaded configuration from {config_file}") + return config + except Exception as e: + print(f"โš ๏ธ Error loading config file: {e}. Using defaults.") + return default_config + else: + print(f"โš ๏ธ {config_file} not found. Using default configuration.") + return default_config + +# Load configuration +config = load_config() +INTERESTS = config.get('interests', {}) +settings = config.get('settings', {}) + +PAPERS_PER_INTEREST = settings.get('papers_per_interest', 10) +SUMMARY_MAX_LENGTH = settings.get('summary_max_length', 160) +USER_AGENT = settings.get('user_agent', 'ResearchDigestBot/1.0') + +# Date filtering: Only fetch papers from the last N days (set to 0 to disable) +RECENT_DAYS = settings.get('recent_days', 7) +FALLBACK_DAYS = settings.get('fallback_days', 90) +MIN_PAPERS_THRESHOLD = settings.get('min_papers_threshold', 5) +FETCH_MULTIPLIER = settings.get('fetch_multiplier', 5) + +# Deduplication: Track papers we've already shown +SEEN_PAPERS_FILE = "seen_papers.json" + +# Initialize summarizer (optional) +try: + summarizer = pipeline( + "summarization", + model="sshleifer/distilbart-cnn-12-6", + device=-1 + ) +except Exception as e: + print(f"โš ๏ธ Summarizer unavailable ({e}). 
Using raw abstracts.") + summarizer = None + +# ====================== +# DEDUPLICATION HELPERS +# ====================== + +def load_seen_papers(): + """Load the set of previously seen paper IDs.""" + if os.path.exists(SEEN_PAPERS_FILE): + try: + with open(SEEN_PAPERS_FILE, 'r') as f: + data = json.load(f) + return set(data.get('seen_ids', [])) + except Exception as e: + print(f"โš ๏ธ Error loading seen papers: {e}") + return set() + +def save_seen_papers(seen_ids): + """Save the set of seen paper IDs.""" + try: + with open(SEEN_PAPERS_FILE, 'w') as f: + json.dump({ + 'seen_ids': list(seen_ids), + 'last_updated': datetime.now().isoformat() + }, f, indent=2) + except Exception as e: + print(f"โš ๏ธ Error saving seen papers: {e}") + +def get_date_filter(days=None): + """Generate date filter for arXiv query (last N days).""" + if days is None: + days = RECENT_DAYS + + if days <= 0: + return "" + + end_date = datetime.now() + start_date = end_date - timedelta(days=days) + + # arXiv date format: YYYYMMDD0000 to YYYYMMDD2359 + date_filter = f"submittedDate:[{start_date.strftime('%Y%m%d')}0000 TO {end_date.strftime('%Y%m%d')}2359]" + return date_filter + +# ====================== +# ARXIV FETCH & PARSE +# ====================== + +def fetch_arxiv_papers(query, max_results=5, days_back=None): + url = "http://export.arxiv.org/api/query" + + # Add date filter if configured + date_filter = get_date_filter(days_back) + if date_filter: + # Combine user query with date filter using AND + query = f"({query}) AND {date_filter}" + + params = { + "search_query": query, + "start": 0, + "max_results": max_results, + "sortBy": "submittedDate", + "sortOrder": "descending" + } + headers = {"User-Agent": USER_AGENT} + try: + response = requests.get(url, params=params, headers=headers, timeout=20) + response.raise_for_status() + return response.text + except Exception as e: + print(f"โŒ Error fetching query '{query}': {e}") + return None + +def parse_papers(xml_data): + if not xml_data: + return [] + try: + root = ET.fromstring(xml_data) + except ET.ParseError: + return [] + + namespace = {'atom': 'http://www.w3.org/2005/Atom'} + papers = [] + + for entry in root.findall('atom:entry', namespace): + title_elem = entry.find('atom:title', namespace) + summary_elem = entry.find('atom:summary', namespace) + id_elem = entry.find('atom:id', namespace) + published_elem = entry.find('atom:published', namespace) + + if None in (title_elem, summary_elem, id_elem): + continue + + title = ' '.join(title_elem.text.strip().split()) + summary = ' '.join(summary_elem.text.strip().split()) + link = id_elem.text + published = published_elem.text.split('T')[0] if published_elem is not None else "Unknown" + + # Extract arXiv ID + arxiv_id = link.split('/abs/')[-1].split('v')[0] + + # Get primary category + primary_cat_elem = entry.find('.//{http://arxiv.org/schemas/atom}primary_category') + category = primary_cat_elem.get('term') if primary_cat_elem is not None else "unknown" + + papers.append({ + 'title': title, + 'summary': summary, + 'link': link, + 'pdf_link': f"https://arxiv.org/pdf/{arxiv_id}.pdf", + 'arxiv_id': arxiv_id, + 'category': category, + 'published': published + }) + return papers + +def summarize_abstract(abstract): + if summarizer is None: + return abstract[:SUMMARY_MAX_LENGTH] + ("..." 
if len(abstract) > SUMMARY_MAX_LENGTH else "") + try: + if len(abstract.split()) < 15: + return abstract + result = summarizer( + abstract, + max_length=min(SUMMARY_MAX_LENGTH, 142), + min_length=30, + truncation=True + ) + return result[0]['summary_text'] + except Exception as e: + return abstract[:SUMMARY_MAX_LENGTH] + "..." + +def calculate_relevance_score(paper, keywords): + """Calculate relevance score based on keyword matches in title and abstract.""" + title_lower = paper['title'].lower() + abstract_lower = paper['summary'].lower() + + score = 0 + matched_keywords = [] + + for keyword in keywords: + keyword_lower = keyword.lower() + # Title matches are worth more + if keyword_lower in title_lower: + score += 3 + matched_keywords.append(keyword) + # Abstract matches + elif keyword_lower in abstract_lower: + score += 1 + matched_keywords.append(keyword) + + # Bonus for multiple keyword matches + if len(matched_keywords) > 2: + score += len(matched_keywords) - 2 + + paper['relevance_score'] = score + paper['matched_keywords'] = matched_keywords + return score + +def estimate_difficulty(abstract, category): + """Estimate paper difficulty using heuristic keyword analysis.""" + abstract_lower = abstract.lower() + + # Theory-heavy indicators + complexity_words = ['theoretical', 'proof', 'theorem', 'convergence', 'optimal', + 'asymptotic', 'lemma', 'proposition', 'rigorous', 'formalism'] + + # Applied/practical indicators + applied_words = ['system', 'framework', 'application', 'dataset', 'benchmark', + 'implementation', 'experiment', 'empirical', 'practical'] + + # Math-heavy categories + math_categories = ['math.', 'stat.', 'quant-ph'] + + # Calculate score + score = sum(1 for w in complexity_words if w in abstract_lower) + score -= sum(0.5 for w in applied_words if w in abstract_lower) + + # Category bonus + if any(cat in category for cat in math_categories): + score += 1 + + # Determine difficulty level + if score > 2: + return "๐Ÿ”ด Theory-Heavy" + elif score > 0.5: + return "๐ŸŸก Advanced" + else: + return "๐ŸŸข Applied" + +def generate_layman_context(title, abstract): + """Generate simple layman explanation using keyword extraction and templates.""" + abstract_lower = abstract.lower() + + # Extract key action words and concepts + action_map = { + 'improv': 'improves', + 'reduc': 'reduces', + 'enhanc': 'enhances', + 'optimi': 'optimizes', + 'acceler': 'speeds up', + 'efficient': 'makes more efficient', + 'novel': 'introduces a new approach to', + 'outperform': 'works better than existing methods for', + 'achiev': 'achieves better', + 'propose': 'proposes a method for', + 'present': 'presents techniques for', + 'address': 'tackles the problem of', + 'privacy': 'protecting data privacy in', + 'federated': 'distributed machine learning across', + 'emotion': 'understanding emotions in', + 'embedded': 'running AI on low-power devices for', + 'edge': 'running AI locally on devices for', + 'compression': 'making models smaller for', + 'inference': 'faster predictions in', + 'generative': 'creating new content with', + 'detection': 'automatically finding', + 'classification': 'categorizing', + 'prediction': 'forecasting' + } + + # Find first matching action + action = "explores techniques in" + for keyword, phrase in action_map.items(): + if keyword in abstract_lower[:300]: # Check first part of abstract + action = phrase + break + + # Extract domain + domain = "machine learning" + if "language model" in abstract_lower or "llm" in abstract_lower or "nlp" in abstract_lower: + domain = 
"language AI" + elif "vision" in abstract_lower or "image" in abstract_lower or "visual" in abstract_lower: + domain = "computer vision" + elif "speech" in abstract_lower or "audio" in abstract_lower: + domain = "speech processing" + elif "privacy" in abstract_lower or "federated" in abstract_lower: + domain = "privacy-preserving AI" + elif "edge" in abstract_lower or "embedded" in abstract_lower or "device" in abstract_lower: + domain = "edge computing" + elif "emotion" in abstract_lower or "affective" in abstract_lower: + domain = "emotion AI" + + return f"This research {action} {domain}." + +# ====================== +# HTML OUTPUT +# ====================== + +def save_html_digest(all_papers_by_interest, filename=None): + # Create archive directory if it doesn't exist + archive_dir = "arxiv_archive" + if not os.path.exists(archive_dir): + os.makedirs(archive_dir) + + if filename is None: + date_str = datetime.now().strftime('%Y%m%d') + filename = os.path.join(archive_dir, f"arxiv_digest_{date_str}.html") + + # Also save as latest.html for easy syncing + latest_file = "latest.html" + + html = f""" + + + + + arXiv Digest โ€ข {datetime.now().strftime('%Y-%m-%d')} + + + +
+
+

arXiv Research Digest

+
{datetime.now().strftime('%B %d, %Y')} โ€ข {sum(len(p) for p in all_papers_by_interest.values())} papers across {len(all_papers_by_interest)} interests
+
+""" + + for interest_name, papers in all_papers_by_interest.items(): + html += f"""
+
+ ๐Ÿ”ฌ +

{interest_name}

+
+""" + if not papers: + html += '

No recent papers found.

\n' + else: + html += '
\n' + for paper in papers: + html += f"""
+
+ {paper['difficulty']} +
+

{paper['title']}

+
๐Ÿ’ก {paper['layman']}
+
{paper['summary']}
+ + +
+""" + html += '
\n' + html += "
\n" + + html += """ +
+ + +""" + # Save archived version + with open(filename, 'w', encoding='utf-8') as f: + f.write(html) + print(f"โœจ HTML digest saved to {filename}") + + # Also save as latest.html for quick access + with open(latest_file, 'w', encoding='utf-8') as f: + f.write(html) + print(f"๐Ÿ“„ Latest digest saved to {latest_file}") + +# ====================== +# MAIN EXECUTION +# ====================== + +if __name__ == "__main__": + # Load previously seen papers + seen_papers = load_seen_papers() + print(f"๐Ÿ“‹ Loaded {len(seen_papers)} previously seen papers") + + if RECENT_DAYS > 0: + print(f"๐Ÿ“… Fetching papers from last {RECENT_DAYS} days") + else: + print("๐Ÿ“… Fetching all available papers (no date filter)") + + all_papers = {} + new_papers_count = 0 + duplicate_count = 0 + + for interest_name, interest_config in INTERESTS.items(): + query = interest_config['query'] + keywords = interest_config['keywords'] + + print(f"\n๐Ÿ” Fetching papers for: {interest_name}") + xml_data = fetch_arxiv_papers(query, PAPERS_PER_INTEREST * FETCH_MULTIPLIER) # Fetch more to filter + papers = parse_papers(xml_data) if xml_data else [] + + print(f" โ†’ Found {len(papers)} papers") + + # Filter out duplicates and calculate relevance + fresh_papers = [] + for p in papers: + if p['arxiv_id'] not in seen_papers: + # Store original abstract for analysis + original_abstract = p['summary'] + + # Calculate relevance score FIRST (before summarization) + calculate_relevance_score(p, keywords) + + # Estimate difficulty level (use ORIGINAL abstract before summarization) + p['difficulty'] = estimate_difficulty(original_abstract, p['category']) + + # Generate layman context (use ORIGINAL abstract for better keyword extraction) + p['layman'] = generate_layman_context(p['title'], original_abstract) + + # Generate summary (do this last to avoid losing original abstract) + p['summary'] = summarize_abstract(original_abstract) + + fresh_papers.append(p) + else: + duplicate_count += 1 + + # Sort by relevance score (highest first) + fresh_papers.sort(key=lambda x: x['relevance_score'], reverse=True) + + # Take top N papers + top_papers = fresh_papers[:PAPERS_PER_INTEREST] + + # Mark these papers as seen + for p in top_papers: + seen_papers.add(p['arxiv_id']) + new_papers_count += 1 + + all_papers[interest_name] = top_papers + print(f" โœจ {len(top_papers)} new papers (from {len(fresh_papers)} candidates, skipped {len(papers) - len(fresh_papers)} duplicates)") + if top_papers: + print(f" ๐Ÿ“Š Relevance scores: {[p['relevance_score'] for p in top_papers]}") + + # FALLBACK: If we didn't get enough papers, try wider date range (only 1 extra request) + if len(top_papers) < MIN_PAPERS_THRESHOLD and FALLBACK_DAYS > RECENT_DAYS: + print(f" ๐Ÿ”„ Low yield, trying fallback search (last {FALLBACK_DAYS} days)...") + time.sleep(3) # Respect rate limit before fallback request + + xml_data_fallback = fetch_arxiv_papers(query, PAPERS_PER_INTEREST * FETCH_MULTIPLIER, days_back=FALLBACK_DAYS) + papers_fallback = parse_papers(xml_data_fallback) if xml_data_fallback else [] + + print(f" โ†’ Found {len(papers_fallback)} papers in fallback") + + # Process fallback papers + fallback_fresh = [] + for p in papers_fallback: + if p['arxiv_id'] not in seen_papers: + original_abstract = p['summary'] + calculate_relevance_score(p, keywords) + p['difficulty'] = estimate_difficulty(original_abstract, p['category']) + p['layman'] = generate_layman_context(p['title'], original_abstract) + p['summary'] = summarize_abstract(original_abstract) + fallback_fresh.append(p) 
+ + # Sort fallback papers by relevance + fallback_fresh.sort(key=lambda x: x['relevance_score'], reverse=True) + + # Add top fallback papers to fill quota + needed = PAPERS_PER_INTEREST - len(top_papers) + additional_papers = fallback_fresh[:needed] + + for p in additional_papers: + seen_papers.add(p['arxiv_id']) + new_papers_count += 1 + + top_papers.extend(additional_papers) + all_papers[interest_name] = top_papers + print(f" โœจ After fallback: {len(top_papers)} total papers") + + # Be kind: 3-second delay between queries (arXiv recommendation) + time.sleep(3) + + # Save updated seen papers + save_seen_papers(seen_papers) + + print(f"\n๐Ÿ“Š Summary:") + print(f" โ€ข Total new papers: {new_papers_count}") + print(f" โ€ข Total duplicates skipped: {duplicate_count}") + print(f" โ€ข Total tracked papers: {len(seen_papers)}") + + save_html_digest(all_papers) + save_tiktok_feed(all_papers) + print("\nโœ… Done! Open the HTML files in your browser.") \ No newline at end of file diff --git a/mobile_demo.png b/mobile_demo.png new file mode 100644 index 0000000..6ac63bf Binary files /dev/null and b/mobile_demo.png differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..590e429 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +transformers==4.46.2 +torch==2.5.1 +torchvision==0.20.1 +requests==2.32.3 \ No newline at end of file diff --git a/reset_seen_papers.py b/reset_seen_papers.py new file mode 100644 index 0000000..f7b96e9 --- /dev/null +++ b/reset_seen_papers.py @@ -0,0 +1,17 @@ +""" +Reset the seen_papers.json file to start fresh. +Run this if you want to see papers again that were previously shown. +""" +import os +import json + +SEEN_PAPERS_FILE = "seen_papers.json" + +if os.path.exists(SEEN_PAPERS_FILE): + # Backup old file + backup_file = SEEN_PAPERS_FILE.replace('.json', '_backup.json') + os.rename(SEEN_PAPERS_FILE, backup_file) + print(f"โœ… Backed up old file to {backup_file}") + print(f"โœ… Reset complete! Next run will show all papers as fresh.") +else: + print("โ„น๏ธ No seen_papers.json file found. Nothing to reset.") diff --git a/run_digest.bat b/run_digest.bat new file mode 100644 index 0000000..68097b5 --- /dev/null +++ b/run_digest.bat @@ -0,0 +1,50 @@ +@echo off +REM ArXiv Digest Runner - Sets up environment and runs the script +cd /d "%~dp0" + +REM Check if virtual environment exists +if not exist "venv\" ( + echo Virtual environment not found. Creating one... + python -m venv venv + if errorlevel 1 ( + echo Error creating virtual environment! + echo Make sure Python is installed and available in PATH. + pause + exit /b 1 + ) + echo Virtual environment created successfully. + + echo Installing dependencies... + call venv\Scripts\activate.bat + python -m pip install --upgrade pip + pip install -r requirements.txt + if errorlevel 1 ( + echo Error installing dependencies! + pause + exit /b 1 + ) + echo Dependencies installed successfully. +) else ( + call venv\Scripts\activate.bat +) + +echo Running arXiv digest... +python main.py +if errorlevel 1 ( + echo Error running main script! + pause + goto :end +) + +echo Generating index page... +python generate_index.py +if errorlevel 1 ( + echo Error generating index! + pause + goto :end +) + +echo Done! All files updated. +pause +:end +deactivate diff --git a/tiktok_feed.html b/tiktok_feed.html new file mode 100644 index 0000000..fccb3fa --- /dev/null +++ b/tiktok_feed.html @@ -0,0 +1,1304 @@ + + + + + + Research Feed โ€ข November 05, 2025 + + + + +
+ + +
+ +
+ +
+ โ™ก +
+ +
+ โ†“ Scroll to explore +
+ + + +