From c47ff3d9ed66e91050755d493d9edeb3c3ea056d Mon Sep 17 00:00:00 2001 From: Romain Quinet Date: Sat, 7 Oct 2023 00:57:45 +0200 Subject: [PATCH] Added more metadata --- download_transcripts.py | 24 ++++++++++++++---------- main.py | 7 +++++-- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/download_transcripts.py b/download_transcripts.py index 1f60415..aac4e17 100644 --- a/download_transcripts.py +++ b/download_transcripts.py @@ -11,14 +11,18 @@ for i in range(1, 139): url = f"https://darknetdiaries.com/transcript/{i}" r = requests.get(url) soup = BeautifulSoup(r.text, 'html.parser') - pre_section = soup.find('pre') - title_section = soup.find('h1') - if pre_section: - transcript = pre_section.get_text() - ep, title = title_section.get_text().split(":", 1) - ep = ep.strip() - title = title.strip() - with open(f"{folder_path}/episode_{i}.txt", "w") as f: - f.write(f"{title}\n{transcript}") - print(f"{ep} {title}") + transcript = soup.find('pre').get_text() + title_section = soup.find('h1').get_text() + + url = f"https://darknetdiaries.com/episode/{i}" + r = requests.get(url) + soup = BeautifulSoup(r.text, 'html.parser') + downloads = soup.find(id='downloads').get_text() + + ep, title = title_section.split(":", 1) + ep = ep.strip() + title = title.strip() + with open(f"{folder_path}/episode_{i}.txt", "w") as f: + f.write(f"{title}\n{downloads}\n{transcript}") + print(f"{ep} {title}") diff --git a/main.py b/main.py index d37b9d7..4368820 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext, +from llama_index import (ServiceContext, StorageContext, load_index_from_storage, Document, set_global_service_context) from llama_index.node_parser import SimpleNodeParser from llama_index import VectorStoreIndex @@ -17,13 +17,16 @@ if not os.path.exists("./index/lock"): episode_number = re.search(r'\d+', filename).group() with open("./transcripts/" + filename, 'r') as f: title = f.readline().strip() + downloads = f.readline().strip() content = f.read() document = Document( text=content, doc_id=filename, metadata={ "episode_number": episode_number, - "episode_title": title + "episode_title": title, + "episode_downloads": downloads, + "episode_url": f"https://darknetdiaries.com/episode/{episode_number}/" } ) documents.append(document)