# goonpedo crawl script
#!/usr/bin/env python3

"""
YouTube Channel Crawler and Keyword/Regex Matcher

Usage:
------
python yt_pedo_finder_v1.0.py <channel_url_or_id> --api-key <YOUR_API_KEY> [options]

Positional Arguments:
  channel             Starting YouTube channel URL or ID

Required Arguments:
  --api-key           Your YouTube Data API v3 key

Optional Arguments:
  --max-depth         Maximum recursion depth for crawling channels (default: 2)
  --max-videos        Maximum number of videos to fetch per channel (default: 5)

Features:
---------
- Recursively crawl a starting channel and its commenters’ channels
- Match keywords/regexes in:
    * Channel name
    * Channel description
    * Top-level comments on videos
- Records matched channels and matched text
- Generates optional reports:
    * CSV
    * HTML 
- Includes channels without matches in reports
- Matches are case-insensitive

Examples:
---------
# Crawl a channel and get all matches (with default depth and videos)
python yt_pedo_finder_v1.0.py https://www.youtube.com/@example --api-key YOUR_KEY

# Crawl a channel with depth 3 and 10 videos per channel
python yt_pedo_finder_v1.0.py UCabcd1234 --api-key YOUR_KEY --max-depth 3 --max-videos 10
"""



import argparse
import csv
import html
import re
import sys
from collections import deque, defaultdict

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# --- CONFIG ---
# Patterns flagged during the crawl.  Most carry an inline (?i); the re.I
# flag passed at compile time makes every pattern case-insensitive anyway.
KEYWORDS = [
    r"(?i)\bsession\b",         # "session" (the messenger app)
    r"\b[a-fA-F0-9]{66}\b",     # 66 hex chars: Session account IDs
    r"(?i)\bteleguard\b",       # TeleGuard messenger
    r"(?i)\bmaps?\b",           # map / maps
    r"(?i)\bp[ox]do\w*",        # pedo / pxdo + any suffix
    r"(?i)\bl[ox]li\w*",        # loli / lxli + any suffix
    r"(?i)\bminors?\b",         # minor / minors
    r"(?i)\bch[ix]ld(ren)?\b",  # child / chxld (+ "ren")
    r"(?i)\bunderage\b",        # underage
    r"(?i)\bageplay\b",         # ageplay
    r"(?i)\bgr[ox][ox]m\w*",    # groom / grxxm + any suffix
    r"(?i)\bg[ox][ox]n\w*",     # goon / gxxn + any suffix
]
REGEXES = [re.compile(pattern, re.I) for pattern in KEYWORDS]
# ---------------

# Module-level crawl state: channel IDs already processed.
visited = set()


def _empty_record():
    """Factory for a fresh per-channel record in `matched`."""
    return {"name": "", "link": "", "matches": []}


# matched[channel_id] = {"name":..., "link":..., "matches":[{"type":..., "text":..., "link":...}]}
matched = defaultdict(_empty_record)


def matches(text: str):
    """Return the patterns (as strings) that hit anywhere in `text`.

    `None` is treated like the empty string, so callers can pass missing
    API fields straight through.
    """
    haystack = text or ""
    return [rx.pattern for rx in REGEXES if rx.search(haystack)]

def get_channel_id(youtube, url_or_id):
    """Resolve a channel reference to a bare "UC..." channel ID.

    Accepts a bare ID ("UC..."), a handle URL (".../@handle[/tab][?...]"),
    or a channel URL (".../channel/UC...[/tab][?...]").

    Raises:
        ValueError: when nothing resolves.
    """
    if url_or_id.startswith("UC"):
        return url_or_id
    if "/@" in url_or_id:
        # Strip any trailing path segment ("/videos") or query string so
        # URLs like ".../@handle/videos?x=1" resolve correctly.
        handle = url_or_id.split("/@")[1].split("/")[0].split("?")[0]
        resp = youtube.search().list(
            part="snippet",
            q=f"@{handle}",
            type="channel",
            maxResults=1
        ).execute()
        if resp["items"]:
            return resp["items"][0]["snippet"]["channelId"]
        # NOTE(review): channels().list(forHandle=handle) resolves handles
        # exactly; search() may return a near-match — consider switching.
    if "channel/" in url_or_id:
        # Same trailing-path/query cleanup for ".../channel/UC.../videos".
        return url_or_id.split("channel/")[1].split("/")[0].split("?")[0]
    raise ValueError(f"Could not resolve channel: {url_or_id}")

def get_channel_info(youtube, channel_id):
    """Fetch snippet, uploads playlist and stats for one channel.

    Returns a dict with keys name/description/uploads/video_count/subs/link,
    or None when the API call fails or the channel does not exist.
    """
    try:
        resp = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=channel_id
        ).execute()
    except HttpError as e:
        print(f"[!] Failed to fetch info for {channel_id}: {e}")
        return None
    items = resp.get("items", [])
    if not items:
        return None
    item = items[0]
    snippet = item.get("snippet", {})
    stats = item.get("statistics", {})
    return {
        "name": snippet.get("title", ""),
        "description": snippet.get("description", ""),
        # Chained .get: relatedPlaylists/uploads can be absent for some
        # channels, and indexing directly would raise KeyError.
        "uploads": item.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads"),
        "video_count": int(stats.get("videoCount", 0)),
        # subscriberCount is omitted when the channel hides it; default 0.
        "subs": int(stats.get("subscriberCount", 0)),
        "link": f"https://www.youtube.com/channel/{channel_id}"
    }

def get_videos(youtube, playlist_id, max_videos=5):
    """Return up to `max_videos` video IDs from `playlist_id`.

    Only the first page is fetched, so at most 50 IDs come back even if
    `max_videos` is larger.  Returns [] for a falsy playlist or API error.
    """
    videos = []
    if not playlist_id:
        return videos
    try:
        # playlistItems.list rejects maxResults > 50; clamp so large
        # --max-videos values don't turn into an invalid-argument error.
        req = youtube.playlistItems().list(
            part="contentDetails",
            playlistId=playlist_id,
            maxResults=min(max_videos, 50)
        )
        resp = req.execute()
    except HttpError as e:
        print(f"[!] Failed to get videos for playlist {playlist_id}: {e}")
        return videos
    for item in resp.get("items", []):
        videos.append(item["contentDetails"]["videoId"])
    return videos

def get_commenters(youtube, video_id, max_comments=50):
    """Return (author_channel_id, comment_text, permalink) for top-level comments.

    Comments with no author channel ID are skipped.  Returns [] when
    comments are disabled or the API call fails (HttpError is swallowed
    with a console warning, matching the crawler's best-effort style).
    """
    commenters = []
    try:
        # commentThreads.list rejects maxResults > 100; clamp defensively.
        req = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=min(max_comments, 100),
            textFormat="plainText"
        )
        resp = req.execute()
    except HttpError as e:
        print(f"[!] Cannot fetch comments for {video_id}: {e}")
        return commenters
    for item in resp.get("items", []):
        snippet = item["snippet"]["topLevelComment"]["snippet"]
        text = snippet.get("textDisplay", "")
        author_id = snippet.get("authorChannelId", {}).get("value")
        if author_id:
            # &lc=<thread id> deep-links straight to the comment.
            commenters.append((author_id, text, f"https://www.youtube.com/watch?v={video_id}&lc={item['id']}"))
    return commenters

def crawl(youtube, start_channel, max_depth=2, max_videos=5):
    """Breadth-first crawl starting at `start_channel`.

    For each unvisited channel within `max_depth`, records its name/link in
    the module-level `matched` dict, checks the channel name and description
    against the keyword regexes, then scans top-level comments on up to
    `max_videos` uploads.  Every commenter's channel is enqueued at
    depth + 1 whether or not their comment matched — that is what drives
    the recursion.  Mutates the module-level `visited` set and `matched`
    defaultdict; returns `matched`.
    """
    queue = deque([(start_channel, 0)])
    while queue:
        channel_id, depth = queue.popleft()
        # Skip channels already processed or enqueued past the depth limit.
        if channel_id in visited or depth > max_depth:
            continue
        visited.add(channel_id)

        info = get_channel_info(youtube, channel_id)
        if not info:
            # If we can't fetch info, still add placeholder name
            # (so the channel appears in reports instead of vanishing).
            if channel_id not in matched:
                matched[channel_id]["name"] = channel_id
                matched[channel_id]["link"] = f"https://www.youtube.com/channel/{channel_id}"
            continue

        print(f"[+] Visiting {channel_id} ({info['name']}) at depth {depth}")
        print(f"    Subscribers: {info['subs']}, Videos: {info['video_count']}")

        # Ensure matched entry has name and link
        matched[channel_id]["name"] = info["name"]
        matched[channel_id]["link"] = info["link"]

        # Check channel name
        if matches(info["name"]):
            print(f"  -> MATCHED name: {info['name']}")
            matched[channel_id]["matches"].append({
                "type": "name",
                "text": info["name"],
                "link": info["link"]
            })

        # --- Check description once per channel ---
        if matches(info["description"]):
            print(f"  -> MATCHED description: {info['description'][:100]}...")
            matched[channel_id]["matches"].append({
                "type": "description",
                "text": info["description"],
                "link": info["link"]
            })

        # No uploads playlist (or an empty channel): nothing left to scan here.
        uploads = info.get("uploads")
        if not uploads or info["video_count"] == 0:
            continue

        videos = get_videos(youtube, uploads, max_videos=max_videos)
        for vid in videos:
            commenters = get_commenters(youtube, vid)
            for author_id, text, link in commenters:
                if matches(text):
                    # Attempt to get name if not already present
                    # (commenters may not have been visited yet).
                    if not matched[author_id]["name"]:
                        author_info = get_channel_info(youtube, author_id)
                        matched[author_id]["name"] = author_info["name"] if author_info else author_id
                        matched[author_id]["link"] = f"https://www.youtube.com/channel/{author_id}"
                    print(f"  -> MATCHED comment in {author_id}: {text[:100]}...")
                    matched[author_id]["matches"].append({
                        "type": "comment",
                        "text": text,
                        "link": link
                    })
                # Every commenter is enqueued, matched or not; the visited
                # check at the top of the loop deduplicates.
                queue.append((author_id, depth + 1))

    return matched


def save_csv(matched, filename="matched_channels.csv"):
    """Dump crawl results to CSV: one row per match, or one blank-match
    row per channel that had no matches (so every channel is listed)."""
    header = ["Channel ID", "Channel Name", "Channel Link", "Match Type", "Matched Text", "Link"]
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for ch_id, info in matched.items():
            prefix = [ch_id, info["name"], info["link"]]
            hits = info["matches"]
            if not hits:
                # No matches: keep the channel in the report with empty cells.
                writer.writerow(prefix + ["", "", ""])
                continue
            for hit in hits:
                writer.writerow(prefix + [hit["type"], hit["text"], hit["link"]])


def save_html(matched, filename="matched_channels.html"):
    """Render crawl results as a simple HTML report.

    Two sections: channels with at least one match (each with a list of
    matched snippets), then channels with no matches.  Every name, link
    and snippet comes from YouTube — untrusted remote content — so all of
    it is passed through html.escape (the original only escaped <> in
    match text, leaving names/links injectable).
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.write("<html><body><h1>YouTube Channel Crawl Results</h1>\n")

        # Channels with matches
        f.write("<h2>Matched Channels</h2>\n")
        for ch_id, info in matched.items():
            if not info["matches"]:
                continue
            name = html.escape(info["name"])
            link = html.escape(info["link"])
            f.write(f"<h3><a href='{link}'>{name}</a></h3>\n<ul>\n")
            for m in info["matches"]:
                # Truncate before escaping so entities aren't cut in half.
                text = html.escape(m["text"][:200])
                m_link = html.escape(m["link"])
                f.write(f"<li>{m['type']}: <a href='{m_link}'>{text}</a></li>\n")
            f.write("</ul>\n")

        # Channels with no matches
        no_match_channels = [info for info in matched.values() if not info["matches"]]
        if no_match_channels:
            f.write("<h2>Channels with no matches</h2>\n<ul>\n")
            for info in no_match_channels:
                f.write(f"<li><a href='{html.escape(info['link'])}'>{html.escape(info['name'])}</a></li>\n")
            f.write("</ul>\n")

        f.write("</body></html>")


if __name__ == "__main__":
    # CLI entry point: parse arguments, crawl, print a summary, then offer
    # optional CSV/HTML exports interactively.
    arg_parser = argparse.ArgumentParser(description="YouTube crawling bot using API")
    arg_parser.add_argument("channel", help="Starting channel URL or ID")
    arg_parser.add_argument("--api-key", required=True, help="YouTube Data API key")
    arg_parser.add_argument("--max-depth", type=int, default=2)
    arg_parser.add_argument("--max-videos", type=int, default=5)
    args = arg_parser.parse_args()

    youtube = build("youtube", "v3", developerKey=args.api_key)
    start_id = get_channel_id(youtube, args.channel)
    results = crawl(youtube, start_id, max_depth=args.max_depth, max_videos=args.max_videos)

    # Order channels by match count, most matches first.
    sorted_results = dict(
        sorted(results.items(), key=lambda entry: len(entry[1]["matches"]), reverse=True)
    )

    print("\n=== Matched Channels ===")
    for info in sorted_results.values():
        print(f"{info['link']} ({len(info['matches'])} matches)")

    # Interactive export prompts; anything other than "y" skips the report.
    if input("\nDo you want to save a CSV report? [y/N]: ").strip().lower() == "y":
        csv_filename = input("Enter CSV filename (default: matched_channels.csv): ").strip() or "matched_channels.csv"
        save_csv(sorted_results, csv_filename)
        print(f"CSV saved to {csv_filename}")

    if input("\nDo you want to save an HTML report? [y/N]: ").strip().lower() == "y":
        html_filename = input("Enter HTML filename (default: matched_channels.html): ").strip() or "matched_channels.html"
        save_html(sorted_results, html_filename)
        print(f"HTML saved to {html_filename}")

    print("\n=== Done ===")