import requests
from bs4 import BeautifulSoup
import json
import logging
from urllib.parse import urljoin, urlparse

# Configuration
CONFIG = {
    'source_url': 'https://www.tvbalkanuzivo.com/',
    'output_file': 'channels_config.py',
    'container_selector': 'div.col-6.col-md-3.col-xl-2',  # Adjusted to the provided page structure
    'link_selector': 'a',  # Adjusted to the provided page structure
    'base_url': 'https://www.tvbalkanuzivo.com'
}

# Set up logging to both a file and the console
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('channel_scraper.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def fetch_page(url):
    """Downloads the web page and returns its HTML, or None on failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        logger.error(f"Error while loading the page: {str(e)}")
        return None


def extract_channels(html):
    """Extracts channel links from the HTML structure."""
    soup = BeautifulSoup(html, 'html.parser')
    channel_containers = soup.select(CONFIG['container_selector'])

    if not channel_containers:
        logger.error("Channel containers not found!")
        return []

    channels = []
    for container in channel_containers:
        try:
            link = container.select_one(CONFIG['link_selector'])
            if not link or not link.has_attr('href'):
                logger.debug(f"Link not found or missing href attribute in container: {container}")
                continue

            relative_url = link['href']
            full_url = urljoin(CONFIG['base_url'], relative_url)

            # Use the link's title attribute as the channel name, if present
            channel_name = link.get('title', 'Unknown Channel')
            if not channel_name or channel_name == "Unknown Channel":
                # Fall back to the last segment of the URL path if no title is available
                parsed_url = urlparse(full_url)
                path_parts = parsed_url.path.strip('/').split('/')
                channel_name = path_parts[-1] if path_parts else "Unknown Channel"

            clean_name = channel_name.replace('-', ' ').title()
            short_name = clean_name.split()[0].title()

            channels.append({
                'name': clean_name,
                'url': full_url,
                'output_name': short_name
            })
            logger.debug(f"Found channel: {clean_name} -> {full_url}")
        except Exception as e:
            logger.warning(f"Error while processing link: {str(e)}", exc_info=True)
            continue

    return channels


def save_config(channels):
    """Saves the channel list as a Python module."""
    with open(CONFIG['output_file'], 'w', encoding='utf-8') as f:
        f.write("STREAM_SOURCES = [\n")
        for channel in channels:
            f.write("    {\n")
            f.write(f"        'name': {json.dumps(channel['name'])},\n")
            f.write(f"        'url': {json.dumps(channel['url'])},\n")
            f.write(f"        'output_name': {json.dumps(channel['output_name'])},\n")
            f.write("    },\n")
        f.write("]\n")


def main():
    logger.info(f"Starting scrape of {CONFIG['source_url']}")
    html = fetch_page(CONFIG['source_url'])
    if not html:
        logger.error("Scraping aborted")
        return

    channels = extract_channels(html)
    if not channels:
        logger.error("No channels found")
        return

    logger.info(f"Channels found: {len(channels)}")
    save_config(channels)
    logger.info(f"Configuration saved to {CONFIG['output_file']}")


if __name__ == "__main__":
    main()
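
# For reference, a sketch of what save_config() writes to channels_config.py.
# The channel name and URL below are hypothetical placeholders, not values
# scraped from the site:
#
# STREAM_SOURCES = [
#     {
#         'name': "Example Channel",
#         'url': "https://www.tvbalkanuzivo.com/example-channel",
#         'output_name': "Example",
#     },
# ]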