refactor(collect): replace bash scripts with Borg CLI

Remove custom collection scripts in favour of Borg
(github.com/Snider/Borg) for data collection. Skills now document
what to collect, with Borg handling the actual collection.

Removed:
- collect-whitepaper.sh, dispatch.sh, update-index.sh
- All skill-specific bash scripts (collect.sh, generate-jobs.sh, etc.)
- hooks.json and HOOKS.md

Updated:
- plugin.json to reference Borg dependency
- SKILL.md files with Borg command examples
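
Example of the new pattern (one of the commands now documented in the
github-history SKILL.md; flags as shown there):

```bash
# Collect an org's repos into an encrypted archive via Borg
borg collect github repos LetheanNetwork --format stim -o lethean.stim
```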

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Snider
Date: 2026-02-02 00:04:24 +00:00
Parent: 35260ed49e
Commit: 3c3d3de1a1
20 changed files with 108 additions and 2760 deletions

@@ -1,6 +1,6 @@
{
"name": "collect",
"description": "Data collection skills for cryptocurrency research - whitepapers, forum archives, project archaeology, and blockchain history",
"description": "Data collection skills using Borg CLI - whitepapers, forum archives, project archaeology, and blockchain history",
"version": "0.1.0",
"author": {
"name": "Host UK",
@@ -14,10 +14,14 @@
"license": "EUPL-1.2",
"keywords": [
"data-collection",
"borg",
"cryptocurrency",
"archive",
"whitepapers",
"blockchain",
"research"
]
],
"dependencies": {
"borg": "github.com/Snider/Borg@v0.1.0"
}
}
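
For reference, the Borg binary that satisfies this dependency is installed with
the command from the updated SKILL.md prerequisites:

```bash
# Install the Borg CLI (Go toolchain required)
go install github.com/Snider/Borg@latest
```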

@@ -1,90 +0,0 @@
# Collection Hooks
Event-driven hooks that trigger during data collection.
## Available Hooks
| Hook | Trigger | Purpose |
|------|---------|---------|
| `collect-whitepaper.sh` | PDF/paper URL detected | Auto-queue whitepapers |
| `on-github-release.sh` | Release found | Archive release metadata |
| `on-explorer-block.sh` | Block data fetched | Index blockchain data |
## Hook Events
### `on_url_found`
Fired when a new URL is discovered during collection.
```bash
# Pattern matching
*.pdf → collect-whitepaper.sh
*/releases/* → on-github-release.sh
*/api/block/* → on-explorer-block.sh
```
### `on_file_collected`
Fired after a file is successfully downloaded.
```bash
# Post-processing
*.json → validate-json.sh
*.html → extract-links.sh
*.pdf → extract-metadata.sh
```
### `on_collection_complete`
Fired when a job batch finishes.
```bash
# Reporting
→ generate-index.sh
→ update-registry.sh
```
## Plugin Integration
For the marketplace plugin system:
```json
{
"name": "whitepaper-collector",
"version": "1.0.0",
"hooks": {
"on_url_found": {
"pattern": "*.pdf",
"handler": "./collect-whitepaper.sh"
}
}
}
```
## Registration
Hooks register in `hooks.json`:
```json
{
"on_url_found": [
{
"pattern": "\\.pdf$",
"handler": "./hooks/collect-whitepaper.sh",
"priority": 10
}
]
}
```
## Usage in Collectors
Collectors call hooks via:
```bash
# In job-collector/process.sh
source ./hooks/dispatch.sh
# When URL found
dispatch_hook "on_url_found" "$URL"
# When file collected
dispatch_hook "on_file_collected" "$FILE" "$TYPE"
```

@@ -1,59 +0,0 @@
#!/usr/bin/env bash
# Hook: collect-whitepaper.sh
# Called when a whitepaper URL is detected during collection
# Usage: ./collect-whitepaper.sh <URL> [destination-folder]
set -e
URL="$1"
DEST="${2:-./whitepapers}"
if [ -z "$URL" ]; then
echo "Usage: $0 <url> [destination]" >&2
exit 1
fi
# Detect paper type from URL
detect_category() {
local url="$1"
case "$url" in
*cryptonote*) echo "cryptonote" ;;
*iacr.org*|*eprint*) echo "research" ;;
*arxiv.org*) echo "research" ;;
*monero*|*getmonero*) echo "research" ;;
*lethean*|*lthn*) echo "lethean" ;;
*) echo "uncategorized" ;;
esac
}
# Generate safe filename from URL
safe_filename() {
local url="$1"
basename "$url" | sed 's/[^a-zA-Z0-9._-]/-/g'
}
CATEGORY=$(detect_category "$URL")
FILENAME=$(safe_filename "$URL")
TARGET_DIR="$DEST/$CATEGORY"
TARGET_FILE="$TARGET_DIR/$FILENAME"
mkdir -p "$TARGET_DIR"
# Check if already collected
if [ -f "$TARGET_FILE" ]; then
echo "Already collected: $TARGET_FILE"
exit 0
fi
echo "Collecting whitepaper:"
echo " URL: $URL"
echo " Category: $CATEGORY"
echo " Destination: $TARGET_FILE"
# Create job entry for proxy collection
echo "$URL|$FILENAME|whitepaper|category=$CATEGORY" >> "$DEST/.pending-jobs.txt"
echo "Job queued: $DEST/.pending-jobs.txt"
echo ""
echo "To collect immediately (if you have direct access):"
echo " curl -L -o '$TARGET_FILE' '$URL'"

@@ -1,80 +0,0 @@
#!/usr/bin/env bash
# Hook dispatcher - source this in collectors
# Usage: source ./hooks/dispatch.sh
HOOKS_DIR="$(dirname "${BASH_SOURCE[0]}")"
HOOKS_JSON="$HOOKS_DIR/hooks.json"
# Dispatch a hook event
# dispatch_hook <event> <arg1> [arg2] ...
dispatch_hook() {
local event="$1"
shift
local args=("$@")
if [ ! -f "$HOOKS_JSON" ]; then
return 0
fi
# Get handlers for this event (requires jq)
if ! command -v jq &> /dev/null; then
echo "Warning: jq not installed, hooks disabled" >&2
return 0
fi
local handlers
handlers=$(jq -r ".hooks[\"$event\"][]? | select(.enabled == true) | @json" "$HOOKS_JSON" 2>/dev/null)
if [ -z "$handlers" ]; then
return 0
fi
echo "$handlers" | while read -r handler_json; do
local name pattern handler_script priority
name=$(echo "$handler_json" | jq -r '.name')
pattern=$(echo "$handler_json" | jq -r '.pattern // ""')
handler_script=$(echo "$handler_json" | jq -r '.handler')
# Check pattern match if pattern exists
if [ -n "$pattern" ] && [ -n "${args[0]}" ]; then
if ! echo "${args[0]}" | grep -qE "$pattern"; then
continue
fi
fi
# Execute handler
local full_path="$HOOKS_DIR/$handler_script"
if [ -x "$full_path" ]; then
echo "[hook] $name: ${args[*]}" >&2
"$full_path" "${args[@]}"
elif [ -f "$full_path" ]; then
echo "[hook] $name: ${args[*]}" >&2
bash "$full_path" "${args[@]}"
fi
done
}
# Register a new hook dynamically
# register_hook <event> <name> <pattern> <handler>
register_hook() {
local event="$1"
local name="$2"
local pattern="$3"
local handler="$4"
if ! command -v jq &> /dev/null; then
echo "Error: jq required for hook registration" >&2
return 1
fi
local new_hook
new_hook=$(jq -n \
--arg name "$name" \
--arg pattern "$pattern" \
--arg handler "$handler" \
'{name: $name, pattern: $pattern, handler: $handler, priority: 50, enabled: true}')
# Add to hooks.json
jq ".hooks[\"$event\"] += [$new_hook]" "$HOOKS_JSON" > "$HOOKS_JSON.tmp" \
&& mv "$HOOKS_JSON.tmp" "$HOOKS_JSON"
}

@@ -1,45 +0,0 @@
{
"version": "1.0.0",
"hooks": {
"on_url_found": [
{
"name": "whitepaper-collector",
"pattern": "\\.pdf$",
"handler": "./collect-whitepaper.sh",
"priority": 10,
"enabled": true
},
{
"name": "whitepaper-iacr",
"pattern": "eprint\\.iacr\\.org",
"handler": "./collect-whitepaper.sh",
"priority": 10,
"enabled": true
},
{
"name": "whitepaper-arxiv",
"pattern": "arxiv\\.org",
"handler": "./collect-whitepaper.sh",
"priority": 10,
"enabled": true
}
],
"on_file_collected": [
{
"name": "pdf-metadata",
"pattern": "\\.pdf$",
"handler": "./extract-pdf-metadata.sh",
"priority": 5,
"enabled": false
}
],
"on_collection_complete": [
{
"name": "update-index",
"handler": "./update-index.sh",
"priority": 100,
"enabled": true
}
]
}
}

@@ -1,269 +0,0 @@
#!/usr/bin/env bash
# BitcoinTalk Thread Collector
# Usage: ./collect.sh <topic-id-or-url> [--pages=N] [--output=DIR]
set -e
DELAY=2 # Be respectful to BTT servers
MAX_PAGES=0 # 0 = all pages
OUTPUT_BASE="."
# Parse topic ID from URL or direct input
parse_topic_id() {
local input="$1"
if [[ "$input" =~ topic=([0-9]+) ]]; then
echo "${BASH_REMATCH[1]}"
else
echo "$input" | grep -oE '[0-9]+'
fi
}
# Fetch a single page
fetch_page() {
local topic_id="$1"
local offset="$2"
local output_file="$3"
local url="https://bitcointalk.org/index.php?topic=${topic_id}.${offset}"
echo " Fetching: $url"
curl -s -A "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)" \
-H "Accept: text/html" \
"$url" > "$output_file"
sleep $DELAY
}
# Check if page has posts
page_has_posts() {
local html_file="$1"
grep -q 'class="post"' "$html_file" 2>/dev/null
}
# Get last page number from first page
get_last_page() {
local html_file="$1"
# Look for navigation like "Pages: [1] 2 3 ... 50"
local max_page=$(grep -oE 'topic=[0-9]+\.[0-9]+' "$html_file" | \
sed 's/.*\.//' | sort -rn | head -1)
echo "${max_page:-0}"
}
# Extract posts from HTML (simplified - works for basic extraction)
extract_posts_simple() {
local html_file="$1"
local output_dir="$2"
local post_offset="$3"
# Use Python for reliable HTML parsing
python3 << PYEOF
import re
import html
import os
from datetime import datetime
html_content = open('$html_file', 'r', encoding='utf-8', errors='ignore').read()
# Pattern to find posts - BTT structure
post_pattern = r'<td class="td_headerandpost">(.*?)</td>\s*</tr>\s*</table>\s*</td>\s*</tr>'
author_pattern = r'<a href="https://bitcointalk\.org/index\.php\?action=profile;u=\d+"[^>]*>([^<]+)</a>'
date_pattern = r'<div class="smalltext">([A-Za-z]+ \d+, \d+, \d+:\d+:\d+ [AP]M)</div>'
post_content_pattern = r'<div class="post"[^>]*>(.*?)</div>\s*(?:<div class="moderatorbar"|</td>)'
posts = re.findall(post_pattern, html_content, re.DOTALL)
post_num = $post_offset
for post_html in posts:
post_num += 1
# Extract author
author_match = re.search(author_pattern, post_html)
author = author_match.group(1) if author_match else "Unknown"
# Extract date
date_match = re.search(date_pattern, post_html)
date_str = date_match.group(1) if date_match else "Unknown date"
# Extract content
content_match = re.search(post_content_pattern, post_html, re.DOTALL)
if content_match:
content = content_match.group(1)
# Clean HTML
content = re.sub(r'<br\s*/?>', '\n', content)
content = re.sub(r'<[^>]+>', '', content)
content = html.unescape(content)
content = content.strip()
else:
content = "(Could not extract content)"
# Determine post type/score
score = "COMMUNITY"
if post_num == 1:
score = "ANN"
elif re.search(r'\[UPDATE\]|\[RELEASE\]|\[ANNOUNCEMENT\]', content, re.I):
score = "UPDATE"
elif '?' in content[:200]:
score = "QUESTION"
# Write post file
filename = f"$output_dir/POST-{post_num:04d}.md"
with open(filename, 'w') as f:
f.write(f"# Post #{post_num}\n\n")
f.write(f"## Metadata\n\n")
f.write(f"| Field | Value |\n")
f.write(f"|-------|-------|\n")
f.write(f"| Author | {author} |\n")
f.write(f"| Date | {date_str} |\n")
f.write(f"| Type | **{score}** |\n\n")
f.write(f"---\n\n")
f.write(f"## Content\n\n")
f.write(content)
f.write(f"\n")
print(f" Created POST-{post_num:04d}.md ({score}) by {author}")
print(f"EXTRACTED:{post_num}")
PYEOF
}
# Main collection function
collect_thread() {
local topic_id="$1"
local output_dir="$OUTPUT_BASE/bitcointalk-$topic_id"
mkdir -p "$output_dir/pages" "$output_dir/posts"
echo "=== Collecting BitcoinTalk Topic: $topic_id ==="
# Fetch first page to get thread info
fetch_page "$topic_id" 0 "$output_dir/pages/page-0.html"
# Extract thread title
local title=$(grep -oP '<title>\K[^<]+' "$output_dir/pages/page-0.html" | head -1)
echo "Thread: $title"
# Get total pages
local last_offset=$(get_last_page "$output_dir/pages/page-0.html")
local total_pages=$(( (last_offset / 20) + 1 ))
echo "Total pages: $total_pages"
if [ "$MAX_PAGES" -gt 0 ] && [ "$MAX_PAGES" -lt "$total_pages" ]; then
total_pages=$MAX_PAGES
echo "Limiting to: $total_pages pages"
fi
# Extract posts from first page
local post_count=0
local result=$(extract_posts_simple "$output_dir/pages/page-0.html" "$output_dir/posts" 0)
post_count=$(echo "$result" | grep "EXTRACTED:" | cut -d: -f2)
# Fetch remaining pages
for (( page=1; page<total_pages; page++ )); do
local offset=$((page * 20))
fetch_page "$topic_id" "$offset" "$output_dir/pages/page-$offset.html"
if ! page_has_posts "$output_dir/pages/page-$offset.html"; then
echo " No more posts found, stopping."
break
fi
result=$(extract_posts_simple "$output_dir/pages/page-$offset.html" "$output_dir/posts" "$post_count")
post_count=$(echo "$result" | grep "EXTRACTED:" | cut -d: -f2)
done
# Generate index
generate_index "$output_dir" "$title" "$topic_id" "$post_count"
echo ""
echo "=== Collection Complete ==="
echo "Posts: $post_count"
echo "Output: $output_dir/"
}
# Generate index file
generate_index() {
local output_dir="$1"
local title="$2"
local topic_id="$3"
local post_count="$4"
cat > "$output_dir/INDEX.md" << EOF
# BitcoinTalk Thread Archive
## Thread Info
| Field | Value |
|-------|-------|
| Title | $title |
| Topic ID | $topic_id |
| URL | https://bitcointalk.org/index.php?topic=$topic_id.0 |
| Posts Archived | $post_count |
| Collected | $(date +%Y-%m-%d) |
---
## Post Type Legend
| Type | Meaning |
|------|---------|
| ANN | Original announcement |
| UPDATE | Official team update |
| QUESTION | Community question |
| ANSWER | Team response |
| COMMUNITY | General discussion |
| CONCERN | Raised issue/criticism |
---
## Posts
| # | Author | Date | Type |
|---|--------|------|------|
EOF
for file in "$output_dir/posts/"POST-*.md; do
[ -f "$file" ] || continue
local num=$(basename "$file" .md | sed 's/POST-0*//')
local author=$(grep "| Author |" "$file" | sed 's/.*| Author | \(.*\) |/\1/')
local date=$(grep "| Date |" "$file" | sed 's/.*| Date | \(.*\) |/\1/')
local type=$(sed -n '/| Type |/s/.*\*\*\([A-Z]*\)\*\*.*/\1/p' "$file")
echo "| [$num](posts/POST-$(printf "%04d" $num).md) | $author | $date | $type |" >> "$output_dir/INDEX.md"
done
echo " Created INDEX.md"
}
# Parse arguments
main() {
local topic_input=""
for arg in "$@"; do
case "$arg" in
--pages=*) MAX_PAGES="${arg#*=}" ;;
--output=*) OUTPUT_BASE="${arg#*=}" ;;
--delay=*) DELAY="${arg#*=}" ;;
*) topic_input="$arg" ;;
esac
done
if [ -z "$topic_input" ]; then
echo "Usage: $0 <topic-id-or-url> [--pages=N] [--output=DIR] [--delay=2]"
echo ""
echo "Examples:"
echo " $0 2769739"
echo " $0 https://bitcointalk.org/index.php?topic=2769739.0"
echo " $0 2769739 --pages=5 --output=./lethean-ann"
exit 1
fi
local topic_id=$(parse_topic_id "$topic_input")
if [ -z "$topic_id" ]; then
echo "Error: Could not parse topic ID from: $topic_input"
exit 1
fi
collect_thread "$topic_id"
}
main "$@"

@@ -1,106 +0,0 @@
#!/usr/bin/env bash
# Generate block explorer collection jobs
# Usage: ./generate-jobs.sh <coin> [--blocks=N] [--sample=daily|weekly|monthly]
set -e
COIN=""
EXPLORER_URL=""
SAMPLE="monthly"
BLOCK_COUNT=100
# Known explorers
declare -A EXPLORERS=(
["lethean"]="https://explorer.lethean.io"
["monero"]="https://xmrchain.net"
["haven"]="https://explorer.havenprotocol.org"
["karbo"]="https://explorer.karbo.io"
["wownero"]="https://explore.wownero.com"
["dero"]="https://explorer.dero.io"
["masari"]="https://explorer.getmasari.org"
["turtlecoin"]="https://explorer.turtlecoin.lol"
["conceal"]="https://explorer.conceal.network"
)
# Parse args
for arg in "$@"; do
case "$arg" in
--url=*) EXPLORER_URL="${arg#*=}" ;;
--blocks=*) BLOCK_COUNT="${arg#*=}" ;;
--sample=*) SAMPLE="${arg#*=}" ;;
--*) ;;
*) COIN="$arg" ;;
esac
done
if [ -z "$COIN" ] && [ -z "$EXPLORER_URL" ]; then
echo "Usage: $0 <coin> [--url=URL] [--blocks=N] [--sample=daily|weekly|monthly]" >&2
echo "" >&2
echo "Known coins: ${!EXPLORERS[*]}" >&2
exit 1
fi
# Get explorer URL
if [ -z "$EXPLORER_URL" ]; then
EXPLORER_URL="${EXPLORERS[$COIN]}"
if [ -z "$EXPLORER_URL" ]; then
echo "# ERROR: Unknown coin '$COIN'. Use --url= to specify explorer." >&2
exit 1
fi
fi
SLUG=$(echo "$COIN" | tr '[:upper:]' '[:lower:]')
echo "# Block Explorer Jobs for $COIN"
echo "# Explorer: $EXPLORER_URL"
echo "# Sample: $SAMPLE"
echo "# Format: URL|FILENAME|TYPE|METADATA"
echo "#"
# Core API endpoints
echo "# === Core Data ==="
echo "${EXPLORER_URL}/api/info|explorer-${SLUG}-info.json|explorer-api|coin=$SLUG,type=info"
echo "${EXPLORER_URL}/api/emission|explorer-${SLUG}-emission.json|explorer-api|coin=$SLUG,type=emission"
echo "${EXPLORER_URL}/api/supply|explorer-${SLUG}-supply.json|explorer-api|coin=$SLUG,type=supply"
echo "${EXPLORER_URL}/api/mempool|explorer-${SLUG}-mempool.json|explorer-api|coin=$SLUG,type=mempool"
# Genesis block
echo "#"
echo "# === Genesis Block ==="
echo "${EXPLORER_URL}/api/block/0|explorer-${SLUG}-block-0.json|explorer-api|coin=$SLUG,block=0"
echo "${EXPLORER_URL}/api/block/1|explorer-${SLUG}-block-1.json|explorer-api|coin=$SLUG,block=1"
# Milestone blocks (if we know the heights)
echo "#"
echo "# === Milestone Blocks ==="
for height in 10000 50000 100000 500000 1000000 2000000; do
echo "${EXPLORER_URL}/api/block/${height}|explorer-${SLUG}-block-${height}.json|explorer-api|coin=$SLUG,block=$height"
done
# Sample blocks by time
echo "#"
echo "# === Sampled Blocks (estimate heights) ==="
case "$SAMPLE" in
daily)
# ~720 blocks/day for 2-min blocks
STEP=720
;;
weekly)
STEP=5040
;;
monthly)
STEP=21600
;;
esac
for ((i=0; i<BLOCK_COUNT; i++)); do
height=$((i * STEP))
echo "${EXPLORER_URL}/api/block/${height}|explorer-${SLUG}-sample-${height}.json|explorer-api|coin=$SLUG,block=$height,sample=$SAMPLE"
done
# Web pages (for scraping if API fails)
echo "#"
echo "# === Web Pages (backup) ==="
echo "${EXPLORER_URL}/|explorer-${SLUG}-home.html|explorer-web|coin=$SLUG"
echo "${EXPLORER_URL}/blocks|explorer-${SLUG}-blocks.html|explorer-web|coin=$SLUG"
echo "${EXPLORER_URL}/stats|explorer-${SLUG}-stats.html|explorer-web|coin=$SLUG"

@@ -1,89 +0,0 @@
#!/usr/bin/env bash
# Generate job list for CoinMarketCap collection
# Usage: ./generate-jobs.sh <coin-slug> [options] > jobs.txt
set -e
COINS=()
HISTORICAL=0
FROM_DATE="2017-01-01"
TO_DATE=$(date +%Y-%m-%d)
# Parse args
for arg in "$@"; do
case "$arg" in
--historical) HISTORICAL=1 ;;
--from=*) FROM_DATE="${arg#*=}" ;;
--to=*) TO_DATE="${arg#*=}" ;;
--*) ;;
*) COINS+=("$arg") ;;
esac
done
if [ ${#COINS[@]} -eq 0 ]; then
echo "Usage: $0 <coin-slug> [coin-slug...] [--historical] [--from=DATE] [--to=DATE]" >&2
echo "" >&2
echo "Examples:" >&2
echo " $0 lethean" >&2
echo " $0 lethean --historical --from=2018-01-01" >&2
echo " $0 lethean monero bitcoin" >&2
exit 1
fi
# Header
echo "# CoinMarketCap job list - $(date +%Y-%m-%d)"
echo "# Coins: ${COINS[*]}"
echo "# Format: URL|FILENAME|TYPE|METADATA"
echo "#"
for COIN in "${COINS[@]}"; do
SLUG=$(echo "$COIN" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g')
echo "# === $SLUG ==="
# Main page (current data, description, links)
echo "https://coinmarketcap.com/currencies/${SLUG}/|cmc-${SLUG}-main.html|cmc-main|coin=$SLUG"
# Markets/exchanges
echo "https://coinmarketcap.com/currencies/${SLUG}/markets/|cmc-${SLUG}-markets.html|cmc-markets|coin=$SLUG"
# Historical data page
echo "https://coinmarketcap.com/currencies/${SLUG}/historical-data/|cmc-${SLUG}-historical.html|cmc-historical|coin=$SLUG"
# News
echo "https://coinmarketcap.com/currencies/${SLUG}/news/|cmc-${SLUG}-news.html|cmc-news|coin=$SLUG"
# API endpoints (if accessible without auth)
# These return JSON and are more reliable than scraping
echo "https://api.coinmarketcap.com/data-api/v3/cryptocurrency/detail?slug=${SLUG}|cmc-${SLUG}-api-detail.json|cmc-api|coin=$SLUG,type=detail"
echo "https://api.coinmarketcap.com/data-api/v3/cryptocurrency/market-pairs/latest?slug=${SLUG}&limit=100|cmc-${SLUG}-api-markets.json|cmc-api|coin=$SLUG,type=markets"
# Historical data via API (may need date chunks)
if [ "$HISTORICAL" = "1" ]; then
echo "#"
echo "# Historical data: $FROM_DATE to $TO_DATE"
# Convert dates to timestamps
FROM_TS=$(date -j -f "%Y-%m-%d" "$FROM_DATE" "+%s" 2>/dev/null || date -d "$FROM_DATE" "+%s")
TO_TS=$(date -j -f "%Y-%m-%d" "$TO_DATE" "+%s" 2>/dev/null || date -d "$TO_DATE" "+%s")
# CMC historical API (public, limited)
echo "https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?slug=${SLUG}&timeStart=${FROM_TS}&timeEnd=${TO_TS}|cmc-${SLUG}-api-historical.json|cmc-api|coin=$SLUG,type=historical"
# Also try the web scrape version with date range
echo "https://coinmarketcap.com/currencies/${SLUG}/historical-data/?start=${FROM_DATE//\-/}&end=${TO_DATE//\-/}|cmc-${SLUG}-historical-range.html|cmc-historical|coin=$SLUG,from=$FROM_DATE,to=$TO_DATE"
fi
echo "#"
done
echo "# === Additional data sources ==="
echo "#"
# CoinGecko as backup (often has more historical data)
for COIN in "${COINS[@]}"; do
SLUG=$(echo "$COIN" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g')
echo "https://www.coingecko.com/en/coins/${SLUG}|coingecko-${SLUG}-main.html|coingecko|coin=$SLUG"
echo "https://api.coingecko.com/api/v3/coins/${SLUG}|coingecko-${SLUG}-api.json|coingecko-api|coin=$SLUG"
echo "https://api.coingecko.com/api/v3/coins/${SLUG}/market_chart?vs_currency=usd&days=max|coingecko-${SLUG}-history.json|coingecko-api|coin=$SLUG,type=history"
done

@@ -1,226 +0,0 @@
#!/usr/bin/env bash
# Process downloaded CoinMarketCap data
# Usage: ./process.sh <downloads-dir> [--output=DIR]
set -e
DOWNLOADS="$1"
OUTPUT="./cmc-archive"
for arg in "$@"; do
case "$arg" in
--output=*) OUTPUT="${arg#*=}" ;;
esac
done
mkdir -p "$OUTPUT"
echo "=== Processing CoinMarketCap downloads ==="
# Process API JSON files first (most reliable)
for file in "$DOWNLOADS"/cmc-*-api-detail.json; do
[ -f "$file" ] || continue
COIN=$(basename "$file" | sed 's/cmc-\(.*\)-api-detail.json/\1/')
COIN_DIR="$OUTPUT/$COIN"
mkdir -p "$COIN_DIR"
echo "Processing: $COIN"
python3 << PYEOF
import json
import os
try:
data = json.load(open('$file', 'r'))
if 'data' in data:
coin = data['data']
# Extract metadata
metadata = {
'id': coin.get('id'),
'name': coin.get('name'),
'symbol': coin.get('symbol'),
'slug': coin.get('slug'),
'description': coin.get('description', ''),
'logo': coin.get('logo'),
'category': coin.get('category'),
'dateAdded': coin.get('dateAdded'),
'urls': coin.get('urls', {}),
'tags': coin.get('tags', []),
}
with open('$COIN_DIR/metadata.json', 'w') as f:
json.dump(metadata, f, indent=2)
print(f" Created metadata.json")
# Create markdown summary
with open('$COIN_DIR/INDEX.md', 'w') as f:
f.write(f"# {metadata['name']} ({metadata['symbol']})\n\n")
f.write(f"## Metadata\n\n")
f.write(f"| Field | Value |\n")
f.write(f"|-------|-------|\n")
f.write(f"| Name | {metadata['name']} |\n")
f.write(f"| Symbol | {metadata['symbol']} |\n")
f.write(f"| CMC ID | {metadata['id']} |\n")
f.write(f"| Added | {metadata['dateAdded']} |\n")
f.write(f"| Category | {metadata.get('category', 'N/A')} |\n\n")
f.write(f"## Links\n\n")
urls = metadata.get('urls', {})
for url_type, url_list in urls.items():
if url_list:
f.write(f"- **{url_type}**: {', '.join(url_list[:3])}\n")
f.write(f"\n## Description\n\n")
f.write(metadata.get('description', 'No description')[:2000])
f.write("\n")
print(f" Created INDEX.md")
except Exception as e:
print(f" Error processing: {e}")
PYEOF
done
# Process historical data
for file in "$DOWNLOADS"/cmc-*-api-historical.json; do
[ -f "$file" ] || continue
COIN=$(basename "$file" | sed 's/cmc-\(.*\)-api-historical.json/\1/')
COIN_DIR="$OUTPUT/$COIN"
mkdir -p "$COIN_DIR/historical"
echo "Processing historical: $COIN"
python3 << PYEOF
import json
import csv
from datetime import datetime
try:
data = json.load(open('$file', 'r'))
if 'data' in data and 'quotes' in data['data']:
quotes = data['data']['quotes']
# Group by year
by_year = {}
for quote in quotes:
ts = quote.get('timestamp', quote.get('time', ''))
if ts:
year = ts[:4]
if year not in by_year:
by_year[year] = []
by_year[year].append({
'date': ts[:10],
'open': quote.get('quote', {}).get('USD', {}).get('open', quote.get('open')),
'high': quote.get('quote', {}).get('USD', {}).get('high', quote.get('high')),
'low': quote.get('quote', {}).get('USD', {}).get('low', quote.get('low')),
'close': quote.get('quote', {}).get('USD', {}).get('close', quote.get('close')),
'volume': quote.get('quote', {}).get('USD', {}).get('volume', quote.get('volume')),
'market_cap': quote.get('quote', {}).get('USD', {}).get('market_cap', quote.get('market_cap')),
})
for year, rows in by_year.items():
filename = f'$COIN_DIR/historical/{year}.csv'
with open(filename, 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['date', 'open', 'high', 'low', 'close', 'volume', 'market_cap'])
writer.writeheader()
writer.writerows(sorted(rows, key=lambda x: x['date']))
print(f" Created historical/{year}.csv ({len(rows)} rows)")
except Exception as e:
print(f" Error: {e}")
PYEOF
done
# Process CoinGecko data as backup
for file in "$DOWNLOADS"/coingecko-*-api.json; do
[ -f "$file" ] || continue
COIN=$(basename "$file" | sed 's/coingecko-\(.*\)-api.json/\1/')
COIN_DIR="$OUTPUT/$COIN"
mkdir -p "$COIN_DIR"
echo "Processing CoinGecko: $COIN"
python3 << PYEOF
import json
try:
data = json.load(open('$file', 'r'))
# Extract useful fields
gecko_data = {
'coingecko_id': data.get('id'),
'coingecko_rank': data.get('coingecko_rank'),
'genesis_date': data.get('genesis_date'),
'sentiment_up': data.get('sentiment_votes_up_percentage'),
'sentiment_down': data.get('sentiment_votes_down_percentage'),
'developer_data': data.get('developer_data', {}),
'community_data': data.get('community_data', {}),
}
with open('$COIN_DIR/coingecko.json', 'w') as f:
json.dump(gecko_data, f, indent=2)
print(f" Created coingecko.json")
except Exception as e:
print(f" Error: {e}")
PYEOF
done
# Process market/exchange data
for file in "$DOWNLOADS"/cmc-*-api-markets.json; do
[ -f "$file" ] || continue
COIN=$(basename "$file" | sed 's/cmc-\(.*\)-api-markets.json/\1/')
COIN_DIR="$OUTPUT/$COIN"
mkdir -p "$COIN_DIR"
echo "Processing markets: $COIN"
python3 << PYEOF
import json
try:
data = json.load(open('$file', 'r'))
if 'data' in data and 'marketPairs' in data['data']:
pairs = data['data']['marketPairs']
markets = []
for pair in pairs[:50]: # Top 50 markets
markets.append({
'exchange': pair.get('exchangeName'),
'pair': pair.get('marketPair'),
'price': pair.get('price'),
'volume_24h': pair.get('volumeUsd'),
'type': pair.get('marketType'),
})
with open('$COIN_DIR/markets.json', 'w') as f:
json.dump(markets, f, indent=2)
# Add to INDEX.md
with open('$COIN_DIR/INDEX.md', 'a') as f:
f.write(f"\n## Markets (Top 10)\n\n")
f.write(f"| Exchange | Pair | Volume 24h |\n")
f.write(f"|----------|------|------------|\n")
for m in markets[:10]:
vol = m.get('volume_24h', 0)
vol_str = f"${vol:,.0f}" if vol else "N/A"
f.write(f"| {m['exchange']} | {m['pair']} | {vol_str} |\n")
print(f" Created markets.json ({len(markets)} pairs)")
except Exception as e:
print(f" Error: {e}")
PYEOF
done
echo ""
echo "=== Processing Complete ==="
echo "Output: $OUTPUT/"

@@ -1,124 +0,0 @@
#!/usr/bin/env bash
# Discover all collection sources for a CryptoNote project
# Usage: ./discover.sh <project-name> | ./discover.sh --abandoned | ./discover.sh --all
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REGISTRY="$SCRIPT_DIR/registry.json"
# Get project data from registry
get_project() {
local name="$1"
jq -r ".projects[] | select(.name | ascii_downcase == \"$(echo $name | tr '[:upper:]' '[:lower:]')\")" "$REGISTRY"
}
# List abandoned projects
list_abandoned() {
jq -r '.projects[] | select(.status == "abandoned" or .status == "low-activity" or .status == "dead") | .name' "$REGISTRY"
}
# List all projects
list_all() {
jq -r '.projects[].name' "$REGISTRY"
}
# Generate sources for a project
generate_sources() {
local name="$1"
local project=$(get_project "$name")
if [ -z "$project" ] || [ "$project" = "null" ]; then
echo "# ERROR: Project '$name' not found in registry" >&2
return 1
fi
local symbol=$(echo "$project" | jq -r '.symbol')
local status=$(echo "$project" | jq -r '.status')
echo "# === $name ($symbol) ==="
echo "# Status: $status"
echo "#"
# GitHub repos
echo "# GitHub Organizations:"
echo "$project" | jq -r '.github[]?' | while read org; do
[ -n "$org" ] && echo "github|https://github.com/$org|$name"
done
# BitcoinTalk
local btt=$(echo "$project" | jq -r '.bitcointalk // empty')
if [ -n "$btt" ]; then
echo "#"
echo "# BitcoinTalk:"
echo "bitcointalk|https://bitcointalk.org/index.php?topic=$btt.0|$name"
fi
# CMC/CoinGecko
local cmc=$(echo "$project" | jq -r '.cmc // empty')
local gecko=$(echo "$project" | jq -r '.coingecko // empty')
echo "#"
echo "# Market Data:"
[ -n "$cmc" ] && echo "cmc|https://coinmarketcap.com/currencies/$cmc/|$name"
[ -n "$gecko" ] && echo "coingecko|https://coingecko.com/en/coins/$gecko|$name"
# Website/Explorer
local website=$(echo "$project" | jq -r '.website // empty')
local explorer=$(echo "$project" | jq -r '.explorer // empty')
echo "#"
echo "# Web Properties:"
[ -n "$website" ] && echo "wayback|https://$website|$name"
[ -n "$explorer" ] && echo "explorer|https://$explorer|$name"
# Salvageable features
local salvage=$(echo "$project" | jq -r '.salvageable[]?' 2>/dev/null)
if [ -n "$salvage" ]; then
echo "#"
echo "# Salvageable:"
echo "$project" | jq -r '.salvageable[]?' | while read item; do
echo "# - $item"
done
fi
echo "#"
}
# Main
case "$1" in
--abandoned)
echo "# Abandoned CryptoNote Projects (Salvage Candidates)"
echo "# Format: source|url|project"
echo "#"
for proj in $(list_abandoned); do
generate_sources "$proj"
done
;;
--all)
echo "# All CryptoNote Projects"
echo "# Format: source|url|project"
echo "#"
for proj in $(list_all); do
generate_sources "$proj"
done
;;
--list)
list_all
;;
--list-abandoned)
list_abandoned
;;
"")
echo "Usage: $0 <project-name> | --abandoned | --all | --list" >&2
echo "" >&2
echo "Examples:" >&2
echo " $0 lethean # Sources for Lethean" >&2
echo " $0 monero # Sources for Monero" >&2
echo " $0 --abandoned # All abandoned projects" >&2
echo " $0 --all # Everything" >&2
echo " $0 --list # Just list project names" >&2
exit 1
;;
*)
generate_sources "$1"
;;
esac

@@ -1,137 +1,78 @@
# GitHub History Collection Skill
Collect and score GitHub issues and PRs for triage analysis.
Collect GitHub repositories, issues, and PRs for archival and triage analysis.
## Prerequisites
```bash
# Install Borg
go install github.com/Snider/Borg@latest
```
## Usage
```bash
# Single repo
./collect.sh https://github.com/LetheanNetwork/lthn-app-vpn
# Clone a single repository
borg collect github repo https://github.com/LetheanNetwork/lthn-app-vpn
# Entire org (all repos)
./collect.sh https://github.com/LetheanNetwork --org
# Clone all repos from an org
borg collect github repos LetheanNetwork
# Just issues (skip PRs)
./collect.sh https://github.com/LetheanNetwork/lthn-app-vpn --issues-only
# Just PRs (skip issues)
./collect.sh https://github.com/LetheanNetwork/lthn-app-vpn --prs-only
# Custom rate limit delay
./collect.sh https://github.com/LetheanNetwork --org --delay=0.5
# Output to encrypted container
borg collect github repos LetheanNetwork --format stim -o lethean.stim
```
## Output Structure
## Target Registry
```
repo/
├── {org}/
│ └── {repo}/
│ ├── Issue/
│ │ ├── 001.md # Sequential, no gaps
│ │ ├── 002.md
│ │ ├── 003.md
│ │ └── INDEX.md # Scored index
│ ├── PR/
│ │ ├── 001.md
│ │ ├── 002.md
│ │ └── INDEX.md
│ └── .json/ # Raw API responses
│ ├── issues-list.json
│ ├── issue-{n}.json
│ ├── prs-list.json
│ └── pr-{n}.json
```
### Lethean Ecosystem
- `LetheanNetwork`
- `letheanVPN`
- `LetheanMovement`
### Sequential vs GitHub Numbers
### CryptoNote Active
- `monero-project`
- `hyle-team`
- `zanoio`
- `kevacoin-project`
- `scala-network`
- `Karbovanets`
- `wownero`
- `ConcealNetwork`
- `ryo-currency`
- **Filename**: `001.md`, `002.md`, etc. - sequential, no gaps
- **Inside file**: `# Issue #47: ...` - preserves original GitHub number
- **INDEX.md**: Maps both: `| 001 | #47 | Title | SCORE |`
### Salvage Priority (dead/abandoned)
- `haven-protocol-org`
- `graft-project`
- `graft-community`
- `oxen-io`
- `loki-project`
- `turtlecoin`
- `masari-project`
- `aeonix`
- `nerva-project`
- `sumoprojects`
- `deroproject`
- `bcndev`
- `electroneum`
This ensures clean sequential browsing while maintaining traceability to GitHub.
## Reception Scores
| Score | Meaning | Triage Action |
|-------|---------|---------------|
| ADDRESSED | Closed after discussion | Review if actually fixed |
| DISMISSED | Labeled wontfix/invalid | **RECLAIM candidate** |
| IGNORED | Closed, no response | **RECLAIM candidate** |
| STALE | Open, no replies | Needs attention |
| ACTIVE | Open with discussion | In progress |
| MERGED | PR accepted | Done |
| REJECTED | PR closed unmerged | Review why |
| PENDING | PR still open | Needs review |
## Requirements
- `gh` CLI authenticated (`gh auth login`)
- `jq` installed
### Non-CN Reference
- `theQRL`
- `hyperswarm`
- `holepunchto`
- `openhive-network`
- `octa-space`
## Batch Collection
Supports comma-separated targets for batch runs:
```bash
# Batch orgs
./collect.sh "LetheanNetwork,graft-project,oxen-io" --org
# Batch repos
./collect.sh "LetheanNetwork/lthn-app-vpn,monero-project/monero"
# Collect everything into encrypted archive
borg collect github repos LetheanNetwork,monero-project,graft-project \
--format stim -o cryptonote-archive.stim
```
## Full Registry List
## Triage Workflow
Copy-paste ready commands for the complete CryptoNote ecosystem:
```bash
# === LETHEAN ECOSYSTEM ===
./collect.sh "LetheanNetwork,letheanVPN,LetheanMovement" --org
# === CRYPTONOTE ACTIVE ===
./collect.sh "monero-project,hyle-team,zanoio,kevacoin-project,scala-network" --org
./collect.sh "Karbovanets,wownero,ConcealNetwork,ryo-currency" --org
# === SALVAGE PRIORITY (dead/abandoned) ===
./collect.sh "haven-protocol-org,graft-project,graft-community" --org
./collect.sh "oxen-io,loki-project" --org
./collect.sh "turtlecoin,masari-project,aeonix,nerva-project,sumoprojects" --org
./collect.sh "deroproject,bcndev,electroneum" --org
# === NON-CN REFERENCE ===
./collect.sh "theQRL,hyperswarm,holepunchto,openhive-network,octa-space" --org
```
### One-liner for everything
```bash
./collect.sh "LetheanNetwork,letheanVPN,LetheanMovement,monero-project,haven-protocol-org,hyle-team,zanoio,kevacoin-project,scala-network,deroproject,Karbovanets,wownero,turtlecoin,masari-project,aeonix,oxen-io,loki-project,graft-project,graft-community,nerva-project,ConcealNetwork,ryo-currency,sumoprojects,bcndev,electroneum,theQRL,hyperswarm,holepunchto,openhive-network,octa-space" --org
```
## Example Run
```bash
$ ./collect.sh "LetheanNetwork,graft-project" --org
=== Collecting all repos from org: LetheanNetwork ===
=== Collecting: LetheanNetwork/lthn-app-vpn ===
Output: ./repo/LetheanNetwork/lthn-app-vpn/
Fetching issues...
Found 145 issues
Fetching issue #1 -> 001.md
...
Created Issue/INDEX.md
Fetching PRs...
Found 98 PRs
...
Created PR/INDEX.md
=== Collecting all repos from org: graft-project ===
=== Collecting: graft-project/graft-network ===
Output: ./repo/graft-project/graft-network/
...
=== Collection Complete ===
Output: ./repo/
```
1. Collect repos with Borg (see the sketch below)
2. Review issues marked DISMISSED or IGNORED
3. Identify salvageable features
4. Document in project-archaeology skill
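
A minimal sketch of step 1, using the batch form shown above (org names taken
from the Salvage Priority list in the Target Registry; flags as documented in
this skill):

```bash
# Step 1: batch-collect the salvage-priority orgs into one encrypted archive
borg collect github repos haven-protocol-org,graft-project,oxen-io,turtlecoin \
  --format stim -o salvage-priority.stim
```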

@@ -1,516 +0,0 @@
#!/usr/bin/env bash
# GitHub History Collector v2
# Usage: ./collect.sh <target> [--org] [--issues-only] [--prs-only]
#
# Supports:
# Single repo: ./collect.sh LetheanNetwork/lthn-app-vpn
# Single org: ./collect.sh LetheanNetwork --org
# Batch orgs: ./collect.sh "LetheanNetwork,graft-project,oxen-io" --org
# Batch repos: ./collect.sh "owner/repo1,owner/repo2"
#
# Output structure:
# repo/{org}/{repo}/Issue/001.md, 002.md, ...
# repo/{org}/{repo}/PR/001.md, 002.md, ...
#
# Rate limiting:
# --check-rate Just show current rate limit status and exit
# Auto-pauses at 25% remaining (75% used) until reset+10s (preserves GraphQL quota)
set -e
# GitHub API allows 5000 requests/hour authenticated
# 0.05s = 20 req/sec = safe margin, bump to 0.1 if rate limited
DELAY=0.05
OUTPUT_BASE="./repo"
# Rate limit protection - check every N calls, pause if under 25% (75% used)
API_CALL_COUNT=0
RATE_CHECK_INTERVAL=100
check_rate_limit() {
local rate_json=$(gh api rate_limit 2>/dev/null)
if [ -z "$rate_json" ]; then
echo " [Rate check failed, continuing...]"
return
fi
local remaining=$(echo "$rate_json" | jq -r '.resources.core.remaining')
local limit=$(echo "$rate_json" | jq -r '.resources.core.limit')
local reset=$(echo "$rate_json" | jq -r '.resources.core.reset')
local percent=$((remaining * 100 / limit))
echo ""
echo ">>> Rate check: ${percent}% remaining ($remaining/$limit)"
if [ "$percent" -lt 25 ]; then
local now=$(date +%s)
local wait_time=$((reset - now + 10))
if [ "$wait_time" -gt 0 ]; then
local resume_time=$(date -d "@$((reset + 10))" '+%H:%M:%S' 2>/dev/null || date -r "$((reset + 10))" '+%H:%M:%S' 2>/dev/null || echo "reset+10s")
echo ">>> Under 25% - pausing ${wait_time}s until $resume_time"
echo ">>> (GraphQL quota preserved for other tools)"
sleep "$wait_time"
echo ">>> Resuming collection..."
fi
else
echo ">>> Above 25% - continuing..."
fi
echo ""
}
track_api_call() {
API_CALL_COUNT=$((API_CALL_COUNT + 1))
if [ $((API_CALL_COUNT % RATE_CHECK_INTERVAL)) -eq 0 ]; then
check_rate_limit
fi
}
# Parse URL into org/repo
parse_github_url() {
local url="$1"
url="${url#https://github.com/}"
url="${url#http://github.com/}"
url="${url%/}"
echo "$url"
}
# Collect single repo
collect_repo() {
local repo="$1" # format: org/repo-name
local org=$(dirname "$repo")
local repo_name=$(basename "$repo")
local issue_dir="$OUTPUT_BASE/$org/$repo_name/Issue"
local pr_dir="$OUTPUT_BASE/$org/$repo_name/PR"
local json_dir="$OUTPUT_BASE/$org/$repo_name/.json"
mkdir -p "$issue_dir" "$pr_dir" "$json_dir"
echo "=== Collecting: $repo ==="
echo " Output: $OUTPUT_BASE/$org/$repo_name/"
# Collect Issues
if [ "$SKIP_ISSUES" != "1" ]; then
echo "Fetching issues..."
if ! gh issue list --repo "$repo" --state all --limit 500 \
--json number,title,state,author,labels,createdAt,closedAt,body \
> "$json_dir/issues-list.json" 2>/dev/null; then
echo " (issues disabled or not accessible)"
echo "[]" > "$json_dir/issues-list.json"
fi
track_api_call
local issue_count=$(jq length "$json_dir/issues-list.json")
echo " Found $issue_count issues"
# Fetch each issue
local seq=0
for github_num in $(jq -r '.[].number' "$json_dir/issues-list.json" | sort -n); do
seq=$((seq + 1))
local seq_padded=$(printf '%03d' $seq)
# Skip if already fetched
if [ -f "$json_dir/issue-$github_num.json" ] && [ -f "$issue_dir/$seq_padded.md" ]; then
echo " Skipping issue #$github_num (already exists)"
continue
fi
echo " Fetching issue #$github_num -> $seq_padded.md"
gh issue view "$github_num" --repo "$repo" \
--json number,title,state,author,labels,createdAt,closedAt,body,comments \
> "$json_dir/issue-$github_num.json"
track_api_call
# Convert to markdown with sequential filename
convert_issue "$json_dir/issue-$github_num.json" "$issue_dir/$seq_padded.md" "$github_num"
sleep $DELAY
done
generate_issue_index "$issue_dir"
fi
# Collect PRs
if [ "$SKIP_PRS" != "1" ]; then
echo "Fetching PRs..."
if ! gh pr list --repo "$repo" --state all --limit 500 \
--json number,title,state,author,createdAt,closedAt,mergedAt,body \
> "$json_dir/prs-list.json" 2>/dev/null; then
echo " (PRs disabled or not accessible)"
echo "[]" > "$json_dir/prs-list.json"
fi
track_api_call
local pr_count=$(jq length "$json_dir/prs-list.json")
echo " Found $pr_count PRs"
# Fetch each PR
local seq=0
for github_num in $(jq -r '.[].number' "$json_dir/prs-list.json" | sort -n); do
seq=$((seq + 1))
local seq_padded=$(printf '%03d' $seq)
# Skip if already fetched
if [ -f "$json_dir/pr-$github_num.json" ] && [ -f "$pr_dir/$seq_padded.md" ]; then
echo " Skipping PR #$github_num (already exists)"
continue
fi
echo " Fetching PR #$github_num -> $seq_padded.md"
gh pr view "$github_num" --repo "$repo" \
--json number,title,state,author,createdAt,closedAt,mergedAt,body,comments,reviews \
> "$json_dir/pr-$github_num.json" 2>/dev/null || true
track_api_call
# Convert to markdown with sequential filename
convert_pr "$json_dir/pr-$github_num.json" "$pr_dir/$seq_padded.md" "$github_num"
sleep $DELAY
done
generate_pr_index "$pr_dir"
fi
}
# Collect all repos in org
collect_org() {
local org="$1"
echo "=== Collecting all repos from org: $org ==="
# Get repo list (1 API call)
local repos
repos=$(gh repo list "$org" --limit 500 --json nameWithOwner -q '.[].nameWithOwner')
track_api_call
while read -r repo; do
[ -n "$repo" ] || continue
collect_repo "$repo"
sleep $DELAY
done <<< "$repos"
}
# Convert issue JSON to markdown
convert_issue() {
local json_file="$1"
local output_file="$2"
local github_num="$3"
local title=$(jq -r '.title' "$json_file")
local state=$(jq -r '.state' "$json_file")
local author=$(jq -r '.author.login' "$json_file")
local created=$(jq -r '.createdAt' "$json_file" | cut -d'T' -f1)
local closed=$(jq -r '.closedAt // "N/A"' "$json_file" | cut -d'T' -f1)
local body=$(jq -r '.body // "No description"' "$json_file")
local labels=$(jq -r '[.labels[].name] | join(", ")' "$json_file")
local comment_count=$(jq '.comments | length' "$json_file")
# Score reception
local score="UNKNOWN"
local reason=""
if [ "$state" = "CLOSED" ]; then
if echo "$labels" | grep -qi "wontfix\|invalid\|duplicate\|won't fix"; then
score="DISMISSED"
reason="Labeled as wontfix/invalid/duplicate"
elif [ "$comment_count" -eq 0 ]; then
score="IGNORED"
reason="Closed with no discussion"
else
score="ADDRESSED"
reason="Closed after discussion"
fi
else
if [ "$comment_count" -eq 0 ]; then
score="STALE"
reason="Open with no response"
else
score="ACTIVE"
reason="Open with discussion"
fi
fi
cat > "$output_file" << ISSUE_EOF
# Issue #$github_num: $title
## Reception Score
| Score | Reason |
|-------|--------|
| **$score** | $reason |
---
## Metadata
| Field | Value |
|-------|-------|
| GitHub # | $github_num |
| State | $state |
| Author | @$author |
| Created | $created |
| Closed | $closed |
| Labels | $labels |
| Comments | $comment_count |
---
## Original Post
**Author:** @$author
$body
---
## Discussion Thread
ISSUE_EOF
jq -r '.comments[] | "### Comment by @\(.author.login)\n\n**Date:** \(.createdAt | split("T")[0])\n\n\(.body)\n\n---\n"' "$json_file" >> "$output_file" 2>/dev/null || true
}
# Convert PR JSON to markdown
convert_pr() {
local json_file="$1"
local output_file="$2"
local github_num="$3"
[ -f "$json_file" ] || return
local title=$(jq -r '.title' "$json_file")
local state=$(jq -r '.state' "$json_file")
local author=$(jq -r '.author.login' "$json_file")
local created=$(jq -r '.createdAt' "$json_file" | cut -d'T' -f1)
local merged=$(jq -r '.mergedAt // "N/A"' "$json_file" | cut -d'T' -f1)
local body=$(jq -r '.body // "No description"' "$json_file")
local score="UNKNOWN"
local reason=""
if [ "$state" = "MERGED" ] || { [ "$merged" != "N/A" ] && [ "$merged" != "null" ]; }; then
score="MERGED"
reason="Contribution accepted"
elif [ "$state" = "CLOSED" ]; then
score="REJECTED"
reason="PR closed without merge"
else
score="PENDING"
reason="Still open"
fi
cat > "$output_file" << PR_EOF
# PR #$github_num: $title
## Reception Score
| Score | Reason |
|-------|--------|
| **$score** | $reason |
---
## Metadata
| Field | Value |
|-------|-------|
| GitHub # | $github_num |
| State | $state |
| Author | @$author |
| Created | $created |
| Merged | $merged |
---
## Description
$body
---
## Reviews & Comments
PR_EOF
jq -r '.comments[]? | "### Comment by @\(.author.login)\n\n\(.body)\n\n---\n"' "$json_file" >> "$output_file" 2>/dev/null || true
jq -r '.reviews[]? | "### Review by @\(.author.login) [\(.state)]\n\n\(.body // "No comment")\n\n---\n"' "$json_file" >> "$output_file" 2>/dev/null || true
}
# Generate Issue index
generate_issue_index() {
local dir="$1"
cat > "$dir/INDEX.md" << 'INDEX_HEADER'
# Issues Index
## Reception Score Legend
| Score | Meaning | Action |
|-------|---------|--------|
| ADDRESSED | Closed after discussion | Review if actually fixed |
| DISMISSED | Labeled wontfix/invalid | **RECLAIM candidate** |
| IGNORED | Closed, no response | **RECLAIM candidate** |
| STALE | Open, no replies | Needs attention |
| ACTIVE | Open with discussion | In progress |
---
## Issues
| Seq | GitHub # | Title | Score |
|-----|----------|-------|-------|
INDEX_HEADER
for file in "$dir"/[0-9]*.md; do
[ -f "$file" ] || continue
local seq=$(basename "$file" .md)
local github_num=$(sed -n 's/^# Issue #\([0-9]*\):.*/\1/p' "$file")
local title=$(head -1 "$file" | sed 's/^# Issue #[0-9]*: //')
local score=$(sed -n '/\*\*[A-Z]/s/.*\*\*\([A-Z]*\)\*\*.*/\1/p' "$file" | head -1)
echo "| [$seq]($seq.md) | #$github_num | $title | $score |" >> "$dir/INDEX.md"
done
echo " Created Issue/INDEX.md"
}
# Generate PR index
generate_pr_index() {
local dir="$1"
cat > "$dir/INDEX.md" << 'INDEX_HEADER'
# Pull Requests Index
## Reception Score Legend
| Score | Meaning | Action |
|-------|---------|--------|
| MERGED | PR accepted | Done |
| REJECTED | PR closed unmerged | Review why |
| PENDING | PR still open | Needs review |
---
## Pull Requests
| Seq | GitHub # | Title | Score |
|-----|----------|-------|-------|
INDEX_HEADER
for file in "$dir"/[0-9]*.md; do
[ -f "$file" ] || continue
local seq=$(basename "$file" .md)
local github_num=$(sed -n 's/^# PR #\([0-9]*\):.*/\1/p' "$file")
local title=$(head -1 "$file" | sed 's/^# PR #[0-9]*: //')
local score=$(sed -n '/\*\*[A-Z]/s/.*\*\*\([A-Z]*\)\*\*.*/\1/p' "$file" | head -1)
echo "| [$seq]($seq.md) | #$github_num | $title | $score |" >> "$dir/INDEX.md"
done
echo " Created PR/INDEX.md"
}
# Show rate limit status
show_rate_status() {
local rate_json=$(gh api rate_limit 2>/dev/null)
if [ -z "$rate_json" ]; then
echo "Failed to fetch rate limit"
exit 1
fi
echo "=== GitHub API Rate Limit Status ==="
echo ""
echo "Core (REST API):"
echo " Remaining: $(echo "$rate_json" | jq -r '.resources.core.remaining') / $(echo "$rate_json" | jq -r '.resources.core.limit')"
local core_reset=$(echo "$rate_json" | jq -r '.resources.core.reset')
echo " Reset: $(date -d "@$core_reset" '+%H:%M:%S' 2>/dev/null || date -r "$core_reset" '+%H:%M:%S' 2>/dev/null || echo "$core_reset")"
echo ""
echo "GraphQL:"
echo " Remaining: $(echo "$rate_json" | jq -r '.resources.graphql.remaining') / $(echo "$rate_json" | jq -r '.resources.graphql.limit')"
local gql_reset=$(echo "$rate_json" | jq -r '.resources.graphql.reset')
echo " Reset: $(date -d "@$gql_reset" '+%H:%M:%S' 2>/dev/null || date -r "$gql_reset" '+%H:%M:%S' 2>/dev/null || echo "$gql_reset")"
echo ""
echo "Search:"
echo " Remaining: $(echo "$rate_json" | jq -r '.resources.search.remaining') / $(echo "$rate_json" | jq -r '.resources.search.limit')"
echo ""
}
# Main
main() {
local targets=""
local is_org=0
SKIP_ISSUES=0
SKIP_PRS=0
# Parse args
for arg in "$@"; do
case "$arg" in
--org) is_org=1 ;;
--issues-only) SKIP_PRS=1 ;;
--prs-only) SKIP_ISSUES=1 ;;
--delay=*) DELAY="${arg#*=}" ;;
--check-rate) show_rate_status; exit 0 ;;
https://*|http://*) targets="$arg" ;;
-*) ;; # ignore unknown flags
*) targets="$arg" ;;
esac
done
if [ -z "$targets" ]; then
echo "Usage: $0 <target> [--org] [--issues-only] [--prs-only] [--delay=0.05] [--check-rate]"
echo ""
echo "Options:"
echo " --check-rate Show rate limit status (Core/GraphQL/Search) and exit"
echo " --delay=N Delay between requests (default: 0.05s)"
echo ""
echo "Rate limiting: Auto-pauses at 25% remaining (75% used) until reset+10s"
echo ""
echo "Target formats:"
echo " Single repo: LetheanNetwork/lthn-app-vpn"
echo " Single org: LetheanNetwork --org"
echo " Batch orgs: \"LetheanNetwork,graft-project,oxen-io\" --org"
echo " Batch repos: \"owner/repo1,owner/repo2\""
echo ""
echo "Output: repo/{org}/{repo}/Issue/ repo/{org}/{repo}/PR/"
echo ""
echo "Full registry list (copy-paste ready):"
echo ""
echo " # Lethean ecosystem"
echo " $0 \"LetheanNetwork,letheanVPN,LetheanMovement\" --org"
echo ""
echo " # CryptoNote projects"
echo " $0 \"monero-project,haven-protocol-org,hyle-team,zanoio\" --org"
echo " $0 \"kevacoin-project,scala-network,deroproject\" --org"
echo " $0 \"Karbovanets,wownero,turtlecoin\" --org"
echo " $0 \"masari-project,aeonix,nerva-project\" --org"
echo " $0 \"ConcealNetwork,ryo-currency,sumoprojects\" --org"
echo " $0 \"bcndev,electroneum\" --org"
echo ""
echo " # Dead/salvage priority"
echo " $0 \"graft-project,graft-community,oxen-io,loki-project\" --org"
echo ""
echo " # Non-CN reference projects"
echo " $0 \"theQRL,hyperswarm,holepunchto,openhive-network,octa-space\" --org"
exit 1
fi
# Handle comma-separated list
IFS=',' read -ra TARGET_LIST <<< "$targets"
for target in "${TARGET_LIST[@]}"; do
# Trim whitespace
target=$(echo "$target" | xargs)
local parsed=$(parse_github_url "$target")
if [ "$is_org" = "1" ]; then
collect_org "$parsed"
else
collect_repo "$parsed"
fi
done
echo ""
echo "=== Collection Complete ==="
echo "Output: $OUTPUT_BASE/"
}
main "$@"

@@ -1,107 +0,0 @@
#!/usr/bin/env bash
# Generate job list for proxy-based collection
# Usage: ./generate-jobs.sh <source> <target> [options] > jobs.txt
set -e
SOURCE="$1"
TARGET="$2"
shift 2 || true
# Defaults
LIMIT=1000
PAGES=100
# Parse options
for arg in "$@"; do
case "$arg" in
--limit=*) LIMIT="${arg#*=}" ;;
--pages=*) PAGES="${arg#*=}" ;;
esac
done
# Output header
echo "# Job list generated $(date +%Y-%m-%d\ %H:%M)"
echo "# Source: $SOURCE | Target: $TARGET"
echo "# Format: URL|FILENAME|TYPE|METADATA"
echo "#"
case "$SOURCE" in
bitcointalk|btt)
# Extract topic ID
TOPIC_ID=$(echo "$TARGET" | grep -oE '[0-9]+' | head -1)
echo "# BitcoinTalk topic: $TOPIC_ID"
echo "#"
# Generate page URLs (20 posts per page)
for ((i=0; i<PAGES*20; i+=20)); do
echo "https://bitcointalk.org/index.php?topic=${TOPIC_ID}.${i}|btt-${TOPIC_ID}-p${i}.html|bitcointalk|page=$((i/20)),offset=$i"
done
;;
reddit)
# Handle r/subreddit or full URL
SUBREDDIT=$(echo "$TARGET" | sed 's|.*/r/||' | sed 's|/.*||')
echo "# Reddit: r/$SUBREDDIT"
echo "#"
# Subreddit pages (top, new, hot)
for sort in "top" "new" "hot"; do
echo "https://old.reddit.com/r/${SUBREDDIT}/${sort}/.json?limit=100|reddit-${SUBREDDIT}-${sort}.json|reddit|sort=$sort"
done
# If it's a specific thread
if [[ "$TARGET" =~ comments/([a-z0-9]+) ]]; then
THREAD_ID="${BASH_REMATCH[1]}"
echo "https://old.reddit.com/r/${SUBREDDIT}/comments/${THREAD_ID}.json|reddit-thread-${THREAD_ID}.json|reddit|thread=$THREAD_ID"
fi
;;
wayback|archive)
# Clean domain
DOMAIN=$(echo "$TARGET" | sed 's|https\?://||' | sed 's|/.*||')
echo "# Wayback Machine: $DOMAIN"
echo "#"
# CDX API to get all snapshots
echo "https://web.archive.org/cdx/search/cdx?url=${DOMAIN}/*&output=json&limit=${LIMIT}|wayback-${DOMAIN}-cdx.json|wayback-index|domain=$DOMAIN"
# Common important pages
for path in "" "index.html" "about" "roadmap" "team" "whitepaper" "faq"; do
echo "https://web.archive.org/web/2020/${DOMAIN}/${path}|wayback-${DOMAIN}-2020-${path:-index}.html|wayback|year=2020,path=$path"
echo "https://web.archive.org/web/2021/${DOMAIN}/${path}|wayback-${DOMAIN}-2021-${path:-index}.html|wayback|year=2021,path=$path"
echo "https://web.archive.org/web/2022/${DOMAIN}/${path}|wayback-${DOMAIN}-2022-${path:-index}.html|wayback|year=2022,path=$path"
done
;;
medium)
# Handle @author or publication
AUTHOR=$(echo "$TARGET" | sed 's|.*/||' | sed 's|^@||')
echo "# Medium: @$AUTHOR"
echo "#"
# Medium RSS feed (easier to parse)
echo "https://medium.com/feed/@${AUTHOR}|medium-${AUTHOR}-feed.xml|medium-rss|author=$AUTHOR"
# Profile page
echo "https://medium.com/@${AUTHOR}|medium-${AUTHOR}-profile.html|medium|author=$AUTHOR"
;;
twitter|x)
USERNAME=$(echo "$TARGET" | sed 's|.*/||' | sed 's|^@||')
echo "# Twitter/X: @$USERNAME"
echo "# Note: Twitter requires auth - use nitter or API"
echo "#"
# Nitter instances (public, no auth)
echo "https://nitter.net/${USERNAME}|twitter-${USERNAME}.html|nitter|user=$USERNAME"
echo "https://nitter.net/${USERNAME}/with_replies|twitter-${USERNAME}-replies.html|nitter|user=$USERNAME,type=replies"
;;
*)
echo "# ERROR: Unknown source '$SOURCE'" >&2
echo "# Supported: bitcointalk, reddit, wayback, medium, twitter" >&2
exit 1
;;
esac

@@ -1,242 +0,0 @@
#!/usr/bin/env bash
# Process downloaded files into markdown
# Usage: ./process.sh <source> <downloads-dir> [--output=DIR]
set -e
SOURCE="$1"
DOWNLOADS="$2"
shift 2 || true
OUTPUT="./processed"
for arg in "$@"; do
case "$arg" in
--output=*) OUTPUT="${arg#*=}" ;;
esac
done
mkdir -p "$OUTPUT/posts"
echo "=== Processing $SOURCE files from $DOWNLOADS ==="
case "$SOURCE" in
bitcointalk|btt)
echo "Processing BitcoinTalk pages..."
POST_NUM=0
for file in "$DOWNLOADS"/btt-*.html; do
[ -f "$file" ] || continue
echo " Processing: $(basename "$file")"
RESULT=$(python3 << PYEOF
import re
import html

html_content = open('$file', 'r', encoding='utf-8', errors='ignore').read()

# Extract thread title from first page
title_match = re.search(r'<title>([^<]+)</title>', html_content)
title = title_match.group(1) if title_match else "Unknown Thread"
title = title.replace(' - Bitcoin Forum', '').strip()
with open('$OUTPUT/.thread_title', 'w') as f:
    f.write(title)

# Pattern for posts
post_blocks = re.findall(r'<div class="post"[^>]*id="msg(\d+)"[^>]*>(.*?)</div>\s*(?:<div class="moderatorbar"|<div class="signature">)', html_content, re.DOTALL)

# Continue numbering from the shell-side counter
post_num = $POST_NUM
for msg_id, content in post_blocks:
    # Clean content
    content = re.sub(r'<br\s*/?>', '\n', content)
    content = re.sub(r'<[^>]+>', '', content)
    content = html.unescape(content).strip()
    if content:
        post_num += 1
        with open(f'$OUTPUT/posts/POST-{post_num:04d}.md', 'w') as f:
            f.write(f"# Post #{post_num}\\n\\n")
            f.write(f"Message ID: {msg_id}\\n\\n")
            f.write(f"---\\n\\n")
            f.write(content)
            f.write("\\n")
        print(f"  POST-{post_num:04d}.md")
print(f"TOTAL:{post_num}")
PYEOF
)
echo "$RESULT" | grep -v '^TOTAL:' || true
# Carry the running post count back into the shell for the next page
POST_NUM=$(echo "$RESULT" | sed -n 's/^TOTAL://p')
done
# Generate index
TITLE=$(cat "$OUTPUT/.thread_title" 2>/dev/null || echo "BitcoinTalk Thread")
TOTAL=$(ls "$OUTPUT/posts/"POST-*.md 2>/dev/null | wc -l)
cat > "$OUTPUT/INDEX.md" << EOF
# $TITLE
Archived from BitcoinTalk
| Posts | $(echo $TOTAL) |
|-------|------|
## Posts
EOF
for f in "$OUTPUT/posts/"POST-*.md; do
[ -f "$f" ] || continue
NUM=$(basename "$f" .md | sed 's/POST-0*//')
echo "- [Post #$NUM](posts/$(basename $f))" >> "$OUTPUT/INDEX.md"
done
;;
reddit)
echo "Processing Reddit JSON..."
for file in "$DOWNLOADS"/reddit-*.json; do
[ -f "$file" ] || continue
echo " Processing: $(basename "$file")"
python3 << PYEOF
import json
import os
data = json.load(open('$file', 'r'))
# Handle different Reddit JSON structures
posts = []
if isinstance(data, list) and len(data) > 0:
if 'data' in data[0]:
# Thread format
posts = data[0]['data']['children']
else:
posts = data
elif isinstance(data, dict) and 'data' in data:
posts = data['data']['children']
for i, post_wrapper in enumerate(posts):
post = post_wrapper.get('data', post_wrapper)
title = post.get('title', post.get('body', '')[:50])
author = post.get('author', 'unknown')
score = post.get('score', 0)
body = post.get('selftext', post.get('body', ''))
created = post.get('created_utc', 0)
filename = f'$OUTPUT/posts/REDDIT-{i+1:04d}.md'
with open(filename, 'w') as f:
f.write(f"# {title}\\n\\n")
f.write(f"| Author | u/{author} |\\n")
f.write(f"|--------|----------|\\n")
f.write(f"| Score | {score} |\\n\\n")
f.write(f"---\\n\\n")
f.write(body or "(no content)")
f.write("\\n")
print(f" REDDIT-{i+1:04d}.md - {title[:40]}...")
PYEOF
done
;;
wayback)
echo "Processing Wayback Machine files..."
for file in "$DOWNLOADS"/wayback-*.html; do
[ -f "$file" ] || continue
BASENAME=$(basename "$file" .html)
echo " Processing: $BASENAME"
# Extract text content
python3 << PYEOF
import re
import html
content = open('$file', 'r', encoding='utf-8', errors='ignore').read()
# Remove scripts and styles
content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.DOTALL)
content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.DOTALL)
# Extract title
title_match = re.search(r'<title>([^<]+)</title>', content)
title = html.unescape(title_match.group(1)) if title_match else "$BASENAME"
# Get body text
body_match = re.search(r'<body[^>]*>(.*?)</body>', content, re.DOTALL)
if body_match:
body = body_match.group(1)
body = re.sub(r'<[^>]+>', ' ', body)
body = html.unescape(body)
body = re.sub(r'\s+', ' ', body).strip()
else:
body = "(could not extract body)"
with open('$OUTPUT/posts/$BASENAME.md', 'w') as f:
f.write(f"# {title}\\n\\n")
f.write(f"Source: Wayback Machine\\n\\n")
f.write(f"---\\n\\n")
f.write(body[:5000]) # Limit length
f.write("\\n")
print(f" $BASENAME.md")
PYEOF
done
;;
medium)
echo "Processing Medium files..."
# Handle RSS feed
for file in "$DOWNLOADS"/medium-*-feed.xml; do
[ -f "$file" ] || continue
echo " Processing RSS: $(basename "$file")"
python3 << PYEOF
import xml.etree.ElementTree as ET
import html
import re
tree = ET.parse('$file')
root = tree.getroot()
channel = root.find('channel')
items = channel.findall('item') if channel else root.findall('.//item')
for i, item in enumerate(items):
title = item.findtext('title', 'Untitled')
author = item.findtext('{http://purl.org/dc/elements/1.1/}creator', 'Unknown')
date = item.findtext('pubDate', '')
content = item.findtext('{http://purl.org/rss/1.0/modules/content/}encoded', '')
# Clean content
content = re.sub(r'<[^>]+>', '', content)
content = html.unescape(content)
filename = f'$OUTPUT/posts/MEDIUM-{i+1:04d}.md'
with open(filename, 'w') as f:
f.write(f"# {title}\\n\\n")
f.write(f"| Author | {author} |\\n")
f.write(f"|--------|----------|\\n")
f.write(f"| Date | {date} |\\n\\n")
f.write(f"---\\n\\n")
f.write(content[:10000])
f.write("\\n")
print(f" MEDIUM-{i+1:04d}.md - {title[:40]}...")
PYEOF
done
;;
*)
echo "ERROR: Unknown source '$SOURCE'"
echo "Supported: bitcointalk, reddit, wayback, medium"
exit 1
;;
esac
echo ""
echo "=== Processing Complete ==="
echo "Output: $OUTPUT/"

View file

@ -30,32 +30,31 @@ Comprehensive collection of distributed ledger, cryptographic protocol, and dece
| oracles | 3 | Chainlink, Band Protocol |
| bridges | 3 | Atomic swaps, XCLAIM, THORChain |
## Usage
## Collection with Borg
```bash
# All papers (91+)
./discover.sh --all > jobs.txt
# Collect papers from academic sources
borg collect website https://eprint.iacr.org --depth 2 --format stim -o iacr-papers.stim
# By category
./discover.sh --category=cryptography > jobs.txt
./discover.sh --category=defi > jobs.txt
# Collect from arXiv
borg collect website https://arxiv.org/list/cs.CR/recent --depth 1
# By topic
./discover.sh --topic=bulletproofs > jobs.txt
./discover.sh --topic=zk-snarks > jobs.txt
# IACR search for more
./discover.sh --search-iacr > search-jobs.txt
# List categories
./discover.sh --help
# Package existing archive
borg compile -f Borgfile -e "archive-password" -o ledger-papers.stim
```
## Output Format
## Registry
```
URL|FILENAME|TYPE|METADATA
https://bitcoin.org/bitcoin.pdf|bitcoin.pdf|paper|category=genesis,title=Bitcoin...
Papers are catalogued in `registry.json`:
```json
{
"id": "paper-id",
"title": "Paper Title",
"year": 2024,
"url": "https://example.com/paper.pdf",
"topics": ["topic1", "topic2"]
}
```
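To drive collection from the registry, a small wrapper can read each entry's `url` and hand it to Borg. This is a minimal sketch, assuming `jq` is installed and that entries carry `id` and `url` fields as in the example above; the Borg flags mirror the examples earlier in this skill:
```bash
#!/usr/bin/env bash
# Sketch: collect every paper listed in registry.json with Borg.
# Assumes jq is available and entries expose "id" and "url" fields.
set -euo pipefail

REGISTRY="${1:-registry.json}"
mkdir -p papers

# Walk the whole registry and emit "id<TAB>url" for every paper entry.
jq -r '.. | objects | select(has("url") and has("id")) | "\(.id)\t\(.url)"' "$REGISTRY" |
while IFS=$'\t' read -r id url; do
  echo "Collecting $id from $url"
  borg collect website "$url" --format stim -o "papers/${id}.stim"
done
```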
## CDN Hosting Structure
@ -69,32 +68,14 @@ papers.lethean.io/
├── cryptonote/
│ ├── cryptonote-v2.pdf
│ └── cns/
│ ├── cns001.txt
│ └── ...
├── mrl/
│ ├── MRL-0001.pdf
│ └── ...
│ └── MRL-0001.pdf
├── cryptography/
│ ├── bulletproofs.pdf
│ ├── clsag.pdf
│ └── ...
│ └── clsag.pdf
└── INDEX.json
```
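`INDEX.json` can be regenerated from whatever PDFs are on disk. The sketch below assumes a simple category-to-paths shape keyed by the first directory level; this skill does not pin down a fixed index schema:
```bash
#!/usr/bin/env bash
# Sketch: rebuild INDEX.json from the PDFs under the CDN root.
# The category -> [relative paths] shape is illustrative only.
set -euo pipefail

ROOT="${1:-papers.lethean.io}"

( cd "$ROOT" && find . -name '*.pdf' | sed 's|^\./||' | sort ) |
jq -R -s '
  split("\n") | map(select(length > 0))
  | map({category: (split("/")[0]), path: .})
  | group_by(.category)
  | map({(.[0].category): map(.path)})
  | add
' > "$ROOT/INDEX.json"
```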
## Adding Papers
Edit `registry.json`:
```json
{
"id": "paper-id",
"title": "Paper Title",
"year": 2024,
"url": "https://example.com/paper.pdf",
"topics": ["topic1", "topic2"]
}
```
## License Note
Papers collected for archival/educational purposes. Original copyrights remain with authors. CDN hosting as community service under CIC principles.
Papers collected for archival/educational purposes. Original copyrights remain with authors.

View file

@ -1,132 +0,0 @@
#!/usr/bin/env bash
# Discover CryptoNote extension papers
# Usage: ./discover.sh [--all] [--category=NAME] [--project=NAME] [--topic=NAME]
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REGISTRY="$SCRIPT_DIR/registry.json"
# Check for jq
if ! command -v jq &> /dev/null; then
echo "Error: jq is required" >&2
exit 1
fi
CATEGORY=""
PROJECT=""
TOPIC=""
ALL=0
# Parse args
for arg in "$@"; do
case "$arg" in
--all) ALL=1 ;;
--category=*) CATEGORY="${arg#*=}" ;;
--project=*) PROJECT="${arg#*=}" ;;
--topic=*) TOPIC="${arg#*=}" ;;
--search-iacr) SEARCH_IACR=1 ;;
--help|-h)
echo "Usage: $0 [options]"
echo ""
echo "Options:"
echo " --all All known papers"
echo " --category=NAME Filter by category (mrl, iacr, projects, attacks)"
echo " --project=NAME Filter by project (monero, haven, masari, etc)"
echo " --topic=NAME Filter by topic (bulletproofs, ringct, etc)"
echo " --search-iacr Generate IACR search jobs"
echo ""
echo "Categories:"
jq -r '.categories | keys[]' "$REGISTRY"
exit 0
;;
esac
done
echo "# Ledger Papers Archive - $(date +%Y-%m-%d)"
echo "# Format: URL|FILENAME|TYPE|METADATA"
echo "#"
emit_paper() {
local url="$1"
local id="$2"
local category="$3"
local title="$4"
local filename="${id}.pdf"
local metadata="category=$category,title=$title"
echo "${url}|${filename}|paper|${metadata}"
}
# Process categories
process_category() {
local cat_name="$1"
echo "# === $cat_name ==="
# Get papers in category
local papers
papers=$(jq -c ".categories[\"$cat_name\"].papers[]?" "$REGISTRY" 2>/dev/null)
echo "$papers" | while read -r paper; do
[ -z "$paper" ] && continue
local id title url urls
id=$(echo "$paper" | jq -r '.id')
title=$(echo "$paper" | jq -r '.title // "Unknown"')
# Check topic filter
if [ -n "$TOPIC" ]; then
if ! echo "$paper" | jq -e ".topics[]? | select(. == \"$TOPIC\")" > /dev/null 2>&1; then
continue
fi
fi
# Check project filter
if [ -n "$PROJECT" ]; then
local paper_project
paper_project=$(echo "$paper" | jq -r '.project // ""')
if [ "$paper_project" != "$PROJECT" ]; then
continue
fi
fi
# Get URL (single or first from array)
url=$(echo "$paper" | jq -r '.url // .urls[0] // ""')
if [ -n "$url" ]; then
emit_paper "$url" "$id" "$cat_name" "$title"
fi
# Also emit alternate URLs for wayback
urls=$(echo "$paper" | jq -r '.urls[]? // empty' 2>/dev/null)
echo "$urls" | while read -r alt_url; do
[ -z "$alt_url" ] && continue
[ "$alt_url" = "$url" ] && continue
echo "# alt: $alt_url"
done
done
echo "#"
}
# Main logic
if [ "$ALL" = "1" ] || [ -z "$CATEGORY" ]; then
# All categories - dynamically from registry
jq -r '.categories | keys[]' "$REGISTRY" | while read -r cat; do
process_category "$cat"
done
else
# Single category
process_category "$CATEGORY"
fi
# IACR search jobs
if [ "$SEARCH_IACR" = "1" ]; then
echo "# === IACR Search Jobs ==="
jq -r '.search_patterns.iacr[]' "$REGISTRY" | while read -r term; do
encoded=$(echo "$term" | sed 's/ /+/g')
echo "https://eprint.iacr.org/search?q=${encoded}|iacr-search-${encoded}.html|search|source=iacr,term=$term"
done
fi

View file

@ -1,105 +0,0 @@
#!/usr/bin/env bash
# Generate mining pool collection jobs
# Usage: ./generate-jobs.sh <coin> [--url=URL] [--all]
set -e
COIN=""
POOL_URL=""
ALL_POOLS=0
# Known pools registry
declare -A POOLS_LETHEAN=(
["herominers"]="https://lethean.herominers.com"
["gntl"]="https://lthn.pool.gntl.uk"
)
declare -A POOLS_MONERO=(
["supportxmr"]="https://supportxmr.com"
["nanopool"]="https://xmr.nanopool.org"
["hashvault"]="https://monero.hashvault.pro"
)
declare -A POOLS_WOWNERO=(
["herominers"]="https://wownero.herominers.com"
)
# Parse args
for arg in "$@"; do
case "$arg" in
--url=*) POOL_URL="${arg#*=}" ;;
--all) ALL_POOLS=1 ;;
--*) ;;
*) COIN="$arg" ;;
esac
done
emit_pool_jobs() {
local pool_name="$1"
local pool_url="$2"
local coin="$3"
local slug="${coin}-${pool_name}"
echo "# === ${pool_name} (${coin}) ==="
# Common nodejs-pool API endpoints
echo "${pool_url}/api/stats|pool-${slug}-stats.json|pool-api|coin=$coin,pool=$pool_name"
echo "${pool_url}/api/pool/blocks|pool-${slug}-blocks.json|pool-api|coin=$coin,pool=$pool_name"
echo "${pool_url}/api/pool/payments|pool-${slug}-payments.json|pool-api|coin=$coin,pool=$pool_name"
echo "${pool_url}/api/network/stats|pool-${slug}-network.json|pool-api|coin=$coin,pool=$pool_name"
echo "${pool_url}/api/config|pool-${slug}-config.json|pool-api|coin=$coin,pool=$pool_name"
# Web pages
echo "${pool_url}/|pool-${slug}-home.html|pool-web|coin=$coin,pool=$pool_name"
echo "${pool_url}/#/blocks|pool-${slug}-blocks-page.html|pool-web|coin=$coin,pool=$pool_name"
echo "#"
}
echo "# Mining Pool Jobs - $(date +%Y-%m-%d)"
echo "# Format: URL|FILENAME|TYPE|METADATA"
echo "#"
if [ "$ALL_POOLS" = "1" ]; then
for pool in "${!POOLS_LETHEAN[@]}"; do
emit_pool_jobs "$pool" "${POOLS_LETHEAN[$pool]}" "lethean"
done
for pool in "${!POOLS_MONERO[@]}"; do
emit_pool_jobs "$pool" "${POOLS_MONERO[$pool]}" "monero"
done
for pool in "${!POOLS_WOWNERO[@]}"; do
emit_pool_jobs "$pool" "${POOLS_WOWNERO[$pool]}" "wownero"
done
elif [ -n "$POOL_URL" ]; then
pool_name=$(echo "$POOL_URL" | sed 's|.*://||; s|/.*||; s|\..*||')
emit_pool_jobs "$pool_name" "$POOL_URL" "${COIN:-unknown}"
elif [ -n "$COIN" ]; then
case "$COIN" in
lethean|lthn)
for pool in "${!POOLS_LETHEAN[@]}"; do
emit_pool_jobs "$pool" "${POOLS_LETHEAN[$pool]}" "lethean"
done
;;
monero|xmr)
for pool in "${!POOLS_MONERO[@]}"; do
emit_pool_jobs "$pool" "${POOLS_MONERO[$pool]}" "monero"
done
;;
wownero|wow)
for pool in "${!POOLS_WOWNERO[@]}"; do
emit_pool_jobs "$pool" "${POOLS_WOWNERO[$pool]}" "wownero"
done
;;
*)
echo "# Unknown coin: $COIN" >&2
echo "# Use --url= to specify pool URL" >&2
exit 1
;;
esac
else
echo "Usage: $0 <coin> [--url=URL] [--all]" >&2
echo "" >&2
echo "Known coins: lethean, monero, wownero" >&2
exit 1
fi

View file

@ -11,54 +11,40 @@ When a CryptoNote project dies, its artifacts scatter:
- Block explorers shut down
- Discord servers empty out
This skill orchestrates a **full dig** on a dead project — running all collectors in sequence to preserve everything salvageable before it's gone forever.
This skill orchestrates a **full dig** using Borg to preserve everything salvageable.
## Usage
## Collection with Borg
```bash
# Full excavation of a project
./excavate.sh masari
# Clone all repos from a dying project
borg collect github repos masari-project --format stim -o masari-github.stim
# Quick scan (just check what's still accessible)
./excavate.sh masari --scan-only
# Archive the website via Wayback
borg collect website https://web.archive.org/web/*/getmasari.org --depth 3
# Specific collectors only
./excavate.sh masari --only=github,bitcointalk
# Resume interrupted dig
./excavate.sh masari --resume
# Package everything into encrypted archive
borg compile -f Borgfile -e "archive-password" -o masari-full-dig.stim
```
## What Gets Collected
| Source | Collector Used | Priority |
|--------|----------------|----------|
| GitHub repos | `github-history` | P1 - often deleted first |
| GitHub releases | `wallet-releases` | P1 - binaries disappear |
| BitcoinTalk ANN | `bitcointalk` | P2 - usually persists |
| Website (Wayback) | `job-collector wayback` | P2 - snapshots exist |
| Block explorer | `block-explorer` | P3 - chain data |
| CoinMarketCap | `coinmarketcap` | P3 - historical prices |
| Whitepapers | `whitepaper-archive` | P1 - research value |
| Reddit | `job-collector reddit` | P4 - community context |
| Medium posts | `job-collector medium` | P4 - announcements |
| Source | Borg Command | Priority |
|--------|--------------|----------|
| GitHub repos | `borg collect github repos <org>` | P1 |
| GitHub releases | `borg collect github repo <url>` | P1 |
| Websites | `borg collect website <url>` | P2 |
| Wayback snapshots | `borg collect website web.archive.org/...` | P2 |
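The priorities can be wired into a small wrapper that captures the fragile P1 sources before anything else. The sketch below reuses only the Borg commands shown above; the org and website arguments are placeholders for a real dig target:
```bash
#!/usr/bin/env bash
# Sketch: run P1 (GitHub) before P2 (web) so the most fragile
# artifacts are captured first. Arguments are placeholders.
set -euo pipefail

ORG="${1:?github org, e.g. masari-project}"
SITE="${2:?project website, e.g. https://getmasari.org}"
OUT="digs/${ORG}"
mkdir -p "$OUT"

# P1: repositories are usually the first thing to disappear
borg collect github repos "$ORG" --format stim -o "$OUT/github.stim"

# P2: live site, then Wayback snapshots
borg collect website "$SITE" --depth 3 --format stim -o "$OUT/website.stim"
borg collect website "https://web.archive.org/web/*/${SITE#https://}" --depth 3
```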
## Output Structure
```
digs/
└── <project-name>/
├── EXCAVATION.md # Dig log with timestamps
├── SALVAGE-REPORT.md # What's worth keeping
├── LESSONS.md # What killed it, what we learned
├── github/ # All repo history
├── releases/ # Wallet binaries, checksums
├── bitcointalk/ # Thread archive
├── website/ # Wayback snapshots
├── explorer/ # Chain data samples
├── market/ # Price history, volume
├── papers/ # Whitepapers, docs
└── community/ # Reddit, Medium, etc
├── github.stim # All repo history (encrypted)
├── website.stim # Website snapshots (encrypted)
└── papers/ # Whitepapers, docs
```
## Report Templates
@ -69,32 +55,15 @@ What code/ideas are worth extracting:
- Wallet features
- Mining algorithms
- Community tools
- Documentation patterns
### LESSONS.md
Post-mortem analysis:
- Timeline of decline
- Root causes (dev burnout, drama, funding, tech debt)
- Warning signs to watch for
- What could have saved it
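A small scaffold can drop empty report skeletons into a fresh dig directory. This is a sketch whose headings mirror the lists above, not a required format:
```bash
#!/usr/bin/env bash
# Sketch: scaffold SALVAGE-REPORT.md and LESSONS.md for a new dig.
set -euo pipefail

DIG="${1:?dig directory, e.g. digs/masari}"
mkdir -p "$DIG"

cat > "$DIG/SALVAGE-REPORT.md" <<'EOF'
# Salvage Report

## Wallet features
## Mining algorithms
## Community tools
## Documentation patterns
EOF

cat > "$DIG/LESSONS.md" <<'EOF'
# Lessons

## Timeline of decline
## Root causes
## Warning signs
## What could have saved it
EOF

echo "Templates written to $DIG"
```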
## Integration with cryptonote-discovery
```bash
# Get list of abandoned projects
cd ../cryptonote-discovery
./discover.sh --list-abandoned
# Excavate all abandoned projects (batch mode)
for proj in $(./discover.sh --list-abandoned); do
../project-archaeology/excavate.sh "$proj"
done
```
## Known Dig Sites
Projects confirmed dead/dying that need excavation:
| Project | Symbol | Death Year | Urgency | Notes |
|---------|--------|------------|---------|-------|
| TurtleCoin | TRTL | 2023 | HIGH | Team burned out, great docs |
@ -104,22 +73,14 @@ Projects confirmed dead/dying that need excavation:
| Sumokoin | SUMO | 2021 | LOW | Drama-killed, large ring research |
| Ryo | RYO | 2023 | LOW | GPU algo work |
## Requirements
## Batch Excavation
- All collector skills installed
- `gh` CLI authenticated
- `jq` installed
- Sufficient disk space for archives
- Patience (full dig can take hours)
## Adding New Dig Sites
When you discover a dead CryptoNote project:
1. Add to `../cryptonote-discovery/registry.json`
2. Include `"salvageable": [...]` field
3. Run `./excavate.sh <project> --scan-only` first
4. If sources still accessible, run full dig
```bash
# Collect multiple dead projects
for org in turtlecoin masari-project aeonix nerva-project; do
borg collect github repos "$org" --format stim -o "digs/${org}.stim"
done
```
---

View file

@ -1,311 +0,0 @@
#!/bin/bash
# Project Archaeology - Deep excavation of abandoned CryptoNote projects
# Usage: ./excavate.sh <project-name> [options]
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SKILLS_DIR="$(dirname "$SCRIPT_DIR")"
REGISTRY="$SKILLS_DIR/cryptonote-discovery/registry.json"
OUTPUT_DIR="$SCRIPT_DIR/digs"
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Defaults
SCAN_ONLY=false
RESUME=false
ONLY_COLLECTORS=""
usage() {
echo "Usage: $0 <project-name> [options]"
echo ""
echo "Options:"
echo " --scan-only Check what's accessible without downloading"
echo " --resume Resume interrupted excavation"
echo " --only=a,b,c Run specific collectors only"
echo " --help Show this help"
echo ""
echo "Examples:"
echo " $0 masari # Full excavation"
echo " $0 masari --scan-only # Quick accessibility check"
echo " $0 masari --only=github,btt # GitHub and BitcoinTalk only"
exit 1
}
log() {
echo -e "${BLUE}[$(date '+%H:%M:%S')]${NC} $1"
}
success() {
echo -e "${GREEN}[✓]${NC} $1"
}
warn() {
echo -e "${YELLOW}[!]${NC} $1"
}
error() {
echo -e "${RED}[✗]${NC} $1"
}
# Get project data from registry
get_project() {
local name="$1"
jq -r --arg n "$name" '.projects[] | select(.name | ascii_downcase == ($n | ascii_downcase))' "$REGISTRY"
}
# Check if a collector should run
should_run() {
local collector="$1"
if [ -z "$ONLY_COLLECTORS" ]; then
return 0
fi
echo "$ONLY_COLLECTORS" | grep -q "$collector"
}
# Scan a URL to check if accessible
check_url() {
local url="$1"
local status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$url" 2>/dev/null || echo "000")
if [ "$status" = "200" ] || [ "$status" = "301" ] || [ "$status" = "302" ]; then
return 0
fi
return 1
}
# Main excavation function
excavate() {
local project_name="$1"
local project=$(get_project "$project_name")
if [ -z "$project" ] || [ "$project" = "null" ]; then
error "Project '$project_name' not found in registry"
echo "Add it to: $REGISTRY"
exit 1
fi
# Extract project data
local name=$(echo "$project" | jq -r '.name')
local symbol=$(echo "$project" | jq -r '.symbol')
local status=$(echo "$project" | jq -r '.status')
local github_orgs=$(echo "$project" | jq -r '.github[]?' 2>/dev/null)
local btt_topic=$(echo "$project" | jq -r '.bitcointalk // empty')
local website=$(echo "$project" | jq -r '.website // empty')
local explorer=$(echo "$project" | jq -r '.explorer // empty')
local cmc=$(echo "$project" | jq -r '.cmc // empty')
echo ""
echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}"
echo -e "${BLUE} PROJECT ARCHAEOLOGY: ${name} (${symbol})${NC}"
echo -e "${BLUE} Status: ${status}${NC}"
echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}"
echo ""
# Create output directory
local dig_dir="$OUTPUT_DIR/$project_name"
mkdir -p "$dig_dir"/{github,releases,bitcointalk,website,explorer,market,papers,community}
# Start excavation log
local log_file="$dig_dir/EXCAVATION.md"
echo "# Excavation Log: $name ($symbol)" > "$log_file"
echo "" >> "$log_file"
echo "**Started:** $(date)" >> "$log_file"
echo "**Status at dig time:** $status" >> "$log_file"
echo "" >> "$log_file"
echo "---" >> "$log_file"
echo "" >> "$log_file"
# Phase 1: GitHub (highest priority - often deleted first)
if should_run "github"; then
echo "## GitHub Repositories" >> "$log_file"
echo "" >> "$log_file"
for org in $github_orgs; do
log "Checking GitHub org: $org"
if $SCAN_ONLY; then
if check_url "https://github.com/$org"; then
success "GitHub org accessible: $org"
echo "- [x] \`$org\` - accessible" >> "$log_file"
else
warn "GitHub org NOT accessible: $org"
echo "- [ ] \`$org\` - NOT accessible" >> "$log_file"
fi
else
log "Running github-history collector on $org..."
# Would call: $SKILLS_DIR/github-history/collect.sh "https://github.com/$org" --org
echo "- Collected: \`$org\`" >> "$log_file"
fi
done
echo "" >> "$log_file"
fi
# Phase 2: BitcoinTalk
if should_run "btt" || should_run "bitcointalk"; then
echo "## BitcoinTalk Thread" >> "$log_file"
echo "" >> "$log_file"
if [ -n "$btt_topic" ]; then
local btt_url="https://bitcointalk.org/index.php?topic=$btt_topic"
log "Checking BitcoinTalk topic: $btt_topic"
if $SCAN_ONLY; then
if check_url "$btt_url"; then
success "BitcoinTalk thread accessible"
echo "- [x] Topic $btt_topic - accessible" >> "$log_file"
else
warn "BitcoinTalk thread NOT accessible"
echo "- [ ] Topic $btt_topic - NOT accessible" >> "$log_file"
fi
else
log "Running bitcointalk collector..."
# Would call: $SKILLS_DIR/bitcointalk/collect.sh "$btt_topic"
echo "- Collected: Topic $btt_topic" >> "$log_file"
fi
else
warn "No BitcoinTalk topic ID in registry"
echo "- [ ] No topic ID recorded" >> "$log_file"
fi
echo "" >> "$log_file"
fi
# Phase 3: Website via Wayback
if should_run "wayback" || should_run "website"; then
echo "## Website (Wayback Machine)" >> "$log_file"
echo "" >> "$log_file"
if [ -n "$website" ]; then
log "Checking Wayback Machine for: $website"
local wayback_api="https://archive.org/wayback/available?url=$website"
if $SCAN_ONLY; then
local wayback_check=$(curl -s "$wayback_api" | jq -r '.archived_snapshots.closest.available // "false"')
if [ "$wayback_check" = "true" ]; then
success "Wayback snapshots available for $website"
echo "- [x] \`$website\` - snapshots available" >> "$log_file"
else
warn "No Wayback snapshots for $website"
echo "- [ ] \`$website\` - no snapshots" >> "$log_file"
fi
else
log "Running wayback collector..."
# Would call: $SKILLS_DIR/job-collector/generate-jobs.sh wayback "$website"
echo "- Collected: \`$website\`" >> "$log_file"
fi
else
warn "No website in registry"
echo "- [ ] No website recorded" >> "$log_file"
fi
echo "" >> "$log_file"
fi
# Phase 4: Block Explorer
if should_run "explorer"; then
echo "## Block Explorer" >> "$log_file"
echo "" >> "$log_file"
if [ -n "$explorer" ]; then
log "Checking block explorer: $explorer"
if $SCAN_ONLY; then
if check_url "https://$explorer"; then
success "Block explorer online: $explorer"
echo "- [x] \`$explorer\` - online" >> "$log_file"
else
warn "Block explorer OFFLINE: $explorer"
echo "- [ ] \`$explorer\` - OFFLINE" >> "$log_file"
fi
else
log "Running block-explorer collector..."
echo "- Collected: \`$explorer\`" >> "$log_file"
fi
else
warn "No explorer in registry"
echo "- [ ] No explorer recorded" >> "$log_file"
fi
echo "" >> "$log_file"
fi
# Phase 5: Market Data (CMC)
if should_run "cmc" || should_run "market"; then
echo "## Market Data" >> "$log_file"
echo "" >> "$log_file"
if [ -n "$cmc" ]; then
log "Checking CoinMarketCap: $cmc"
if $SCAN_ONLY; then
if check_url "https://coinmarketcap.com/currencies/$cmc/"; then
success "CMC page exists: $cmc"
echo "- [x] CMC: \`$cmc\` - exists" >> "$log_file"
else
warn "CMC page NOT found: $cmc"
echo "- [ ] CMC: \`$cmc\` - not found" >> "$log_file"
fi
else
log "Running coinmarketcap collector..."
echo "- Collected: \`$cmc\`" >> "$log_file"
fi
else
warn "No CMC slug in registry"
echo "- [ ] No CMC slug recorded" >> "$log_file"
fi
echo "" >> "$log_file"
fi
# Finalize log
echo "---" >> "$log_file"
echo "" >> "$log_file"
echo "**Completed:** $(date)" >> "$log_file"
if $SCAN_ONLY; then
echo ""
success "Scan complete. See: $log_file"
else
echo ""
success "Excavation complete. Output in: $dig_dir"
echo ""
log "Next steps:"
echo " 1. Review: $log_file"
echo " 2. Generate: $dig_dir/SALVAGE-REPORT.md"
echo " 3. Write: $dig_dir/LESSONS.md"
fi
}
# Parse arguments
if [ $# -lt 1 ]; then
usage
fi
PROJECT="$1"
shift
while [ $# -gt 0 ]; do
case "$1" in
--scan-only)
SCAN_ONLY=true
;;
--resume)
RESUME=true
;;
--only=*)
ONLY_COLLECTORS="${1#*=}"
;;
--help)
usage
;;
*)
error "Unknown option: $1"
usage
;;
esac
shift
done
# Run excavation
excavate "$PROJECT"

View file

@ -1,38 +0,0 @@
#!/usr/bin/env bash
# Hook: update-index.sh
# Called after collection completes to update indexes
WHITEPAPERS_DIR="${1:-./whitepapers}"
echo "[update-index] Updating whitepaper index..."
# Count papers in each category
for category in cryptonote lethean research uncategorized; do
dir="$WHITEPAPERS_DIR/$category"
if [ -d "$dir" ]; then
count=$(find "$dir" -name "*.pdf" 2>/dev/null | wc -l | tr -d ' ')
echo " $category: $count papers"
fi
done
# Update INDEX.md with collected papers
INDEX="$WHITEPAPERS_DIR/INDEX.md"
if [ -f "$INDEX" ]; then
# Add collected papers section if not exists
if ! grep -q "## Recently Collected" "$INDEX"; then
echo "" >> "$INDEX"
echo "## Recently Collected" >> "$INDEX"
echo "" >> "$INDEX"
echo "_Last updated: $(date +%Y-%m-%d)_" >> "$INDEX"
echo "" >> "$INDEX"
fi
fi
# Process pending jobs
PENDING="$WHITEPAPERS_DIR/.pending-jobs.txt"
if [ -f "$PENDING" ]; then
count=$(wc -l < "$PENDING" | tr -d ' ')
echo "[update-index] $count papers queued for collection"
fi
echo "[update-index] Done"