agent/claude/skills/job-collector/generate-jobs.sh
#!/usr/bin/env bash
# Generate job list for proxy-based collection
# Usage: ./generate-jobs.sh <source> <target> [options] > jobs.txt
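#
# Examples (targets here are illustrative):
#   ./generate-jobs.sh bitcointalk "https://bitcointalk.org/index.php?topic=1234567.0" --pages=50 > jobs.txt
#   ./generate-jobs.sh reddit r/bitcoin > jobs.txt
#   ./generate-jobs.sh wayback example.com --limit=500 > jobs.txt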
set -euo pipefail

if [[ $# -lt 2 ]]; then
  echo "Usage: $0 <source> <target> [--limit=N] [--pages=N]" >&2
  exit 1
fi
SOURCE="$1"
TARGET="$2"
shift 2

# Defaults
LIMIT=1000   # max CDX snapshots (wayback)
PAGES=100    # max topic pages (bitcointalk)

# Parse options
for arg in "$@"; do
  case "$arg" in
    --limit=*) LIMIT="${arg#*=}" ;;
    --pages=*) PAGES="${arg#*=}" ;;
  esac
done
# Output header
echo "# Job list generated $(date +%Y-%m-%d\ %H:%M)"
echo "# Source: $SOURCE | Target: $TARGET"
echo "# Format: URL|FILENAME|TYPE|METADATA"
echo "#"
case "$SOURCE" in
bitcointalk|btt)
# Extract topic ID
TOPIC_ID=$(echo "$TARGET" | grep -oE '[0-9]+' | head -1)
echo "# BitcoinTalk topic: $TOPIC_ID"
echo "#"
# Generate page URLs (20 posts per page)
for ((i=0; i<PAGES*20; i+=20)); do
echo "https://bitcointalk.org/index.php?topic=${TOPIC_ID}.${i}|btt-${TOPIC_ID}-p${i}.html|bitcointalk|page=$((i/20)),offset=$i"
done
;;
  reddit)
    # Handle bare names, r/subreddit, or full URLs
    SUBREDDIT=$(echo "$TARGET" | sed -E 's|.*/r/||; s|^r/||; s|/.*||')
    echo "# Reddit: r/$SUBREDDIT"
    echo "#"
    # Subreddit listing pages (top, new, hot)
    for sort in top new hot; do
      echo "https://old.reddit.com/r/${SUBREDDIT}/${sort}/.json?limit=100|reddit-${SUBREDDIT}-${sort}.json|reddit|sort=$sort"
    done
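    # Note: reddit's .json endpoints tend to return 429 unless the fetcher
    # sends a descriptive User-Agent header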
    # If it's a specific thread
    if [[ "$TARGET" =~ comments/([a-z0-9]+) ]]; then
      THREAD_ID="${BASH_REMATCH[1]}"
      echo "https://old.reddit.com/r/${SUBREDDIT}/comments/${THREAD_ID}.json|reddit-thread-${THREAD_ID}.json|reddit|thread=$THREAD_ID"
    fi
    ;;
  wayback|archive)
    # Clean domain
    DOMAIN=$(echo "$TARGET" | sed -E 's|https?://||; s|/.*||')
    echo "# Wayback Machine: $DOMAIN"
    echo "#"
    # CDX API to get all snapshots
    echo "https://web.archive.org/cdx/search/cdx?url=${DOMAIN}/*&output=json&limit=${LIMIT}|wayback-${DOMAIN}-cdx.json|wayback-index|domain=$DOMAIN"
    # Common important pages, one snapshot request per year
    for path in "" "index.html" "about" "roadmap" "team" "whitepaper" "faq"; do
      for year in 2020 2021 2022; do
        echo "https://web.archive.org/web/${year}/${DOMAIN}/${path}|wayback-${DOMAIN}-${year}-${path:-index}.html|wayback|year=$year,path=$path"
      done
    done
    ;;
  medium)
    # Handle @author or publication
    AUTHOR=$(echo "$TARGET" | sed 's|.*/||' | sed 's|^@||')
    echo "# Medium: @$AUTHOR"
    echo "#"
    # Medium RSS feed (easier to parse)
    echo "https://medium.com/feed/@${AUTHOR}|medium-${AUTHOR}-feed.xml|medium-rss|author=$AUTHOR"
    # Profile page
    echo "https://medium.com/@${AUTHOR}|medium-${AUTHOR}-profile.html|medium|author=$AUTHOR"
    ;;
  twitter|x)
    USERNAME=$(echo "$TARGET" | sed 's|.*/||' | sed 's|^@||')
    echo "# Twitter/X: @$USERNAME"
    echo "# Note: Twitter requires auth - use nitter or API"
    echo "#"
    # Nitter instances (public, no auth)
    echo "https://nitter.net/${USERNAME}|twitter-${USERNAME}.html|nitter|user=$USERNAME"
    echo "https://nitter.net/${USERNAME}/with_replies|twitter-${USERNAME}-replies.html|nitter|user=$USERNAME,type=replies"
    ;;
  *)
    echo "# ERROR: Unknown source '$SOURCE'" >&2
    echo "# Supported: bitcointalk, reddit, wayback, medium, twitter" >&2
    exit 1
    ;;
esac
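
# A minimal consumer sketch (hypothetical; the actual fetcher is a separate
# collection hook). It assumes plain curl and the URL|FILENAME|TYPE|METADATA
# format above:
#
#   while IFS='|' read -r url file type meta; do
#     [[ -z "$url" || "$url" == \#* ]] && continue   # skip blank/header lines
#     curl -fsSL --retry 3 "$url" -o "$file"
#   done < jobs.txt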