import asyncio
import json

import aiohttp

# Number of pages fetched concurrently per batch.
CONCURRENCY = 50
# Highest page number to scrape; 0 means no limit.
LIMIT = 0
# When True, posts flagged as hidden are skipped.
SKIP_HIDDEN = False
FORUM = "981190-wild-west-saga"


async def get_url(session, url):
    async with session.get(url) as response:
        # print("getting", response.url)
        return await response.text()


async def main():
    async with aiohttp.ClientSession(base_url="https://www.kongregate.com") as session:
        threads = {}
        increment = 1
        stopped = False
        while not stopped:
            start_page = increment
            end_page = increment + CONCURRENCY
            increment += CONCURRENCY
            if LIMIT:
                if start_page > LIMIT:
                    break
                # Clamp the batch so no page beyond LIMIT is requested.
                end_page = min(end_page, LIMIT + 1)

            # The "x" segment stands in for a topic id; the endpoint paginates
            # posts across the whole forum regardless of the topic named.
            tasks = [
                asyncio.ensure_future(
                    get_url(session, f"/forums/{FORUM}/topics/x/posts.json?page={page}")
                    # get_url(session, f"/posts.json?page={page}")
                )
                for page in range(start_page, end_page)
            ]
            print(f"Scraping pages {start_page} to {end_page - 1}")
            responses = await asyncio.gather(*tasks)

            for data in responses:
                data_json = json.loads(data)
                # An empty "posts" array means we have run past the last page.
                if not data_json["posts"]:
                    stopped = True
                    break
                usernames = {
                    user["id"]: user["username"]
                    for user in data_json["users"]
                }
                forums = {
                    forum["id"]: {
                        "name": forum["name"],
                        "description": forum["description"],
                    }
                    for forum in data_json["forums"]
                }
                # Register any topics we have not seen yet.
                threads.update({
                    topic["id"]: {
                        "forum": {
                            "forum_name": forums[topic["forum_id"]]["name"],
                            "forum_description": forums[topic["forum_id"]]["description"],
                        },
                        "title": topic["title"],
                        "posts": [],
                    }
                    for topic in data_json["topics"]
                    if topic["id"] not in threads
                })
                for post in data_json["posts"]:
                    if SKIP_HIDDEN and post["hidden"]:
                        continue
                    # Prepend, reversing the order in which posts arrive.
                    threads[post["topic_id"]]["posts"].insert(0, {
                        "post_id": post["id"],
                        "username": usernames[post["user_id"]],
                        "user_id": post["user_id"],
                        "time": post["posted_at"],
                        # Strip non-ASCII characters from the post body.
                        "body": post["body"].encode("ascii", "ignore").decode(),
                        "hidden": post["hidden"],
                    })
        return threads


if __name__ == "__main__":
    threads = asyncio.run(main())
    print(f"Scraped {len(threads)} threads.")
    with open(f"{FORUM}.json", "w") as file:
        file.write(json.dumps(threads, indent=4))
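
# A minimal sketch of consuming the dump this script writes; the file name
# follows the f"{FORUM}.json" pattern above. Kept commented out so the
# scraper itself stays the only thing that runs. Purely illustrative.
#
#   import json
#
#   with open("981190-wild-west-saga.json") as f:
#       threads = json.load(f)
#
#   # Each key is a topic id; each value holds the forum info, title,
#   # and the list of posts collected above.
#   for topic_id, thread in threads.items():
#       print(topic_id, thread["title"], len(thread["posts"]))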