Added more metadata

This commit is contained in:
Romain Quinet 2023-10-07 00:57:45 +02:00
parent a582d89c57
commit c47ff3d9ed
2 changed files with 19 additions and 12 deletions

View File

@@ -11,14 +11,18 @@ for i in range(1, 139):
url = f"https://darknetdiaries.com/transcript/{i}" url = f"https://darknetdiaries.com/transcript/{i}"
r = requests.get(url) r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser') soup = BeautifulSoup(r.text, 'html.parser')
pre_section = soup.find('pre')
title_section = soup.find('h1')
if pre_section: transcript = soup.find('pre').get_text()
transcript = pre_section.get_text() title_section = soup.find('h1').get_text()
ep, title = title_section.get_text().split(":", 1)
url = f"https://darknetdiaries.com/episode/{i}"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
downloads = soup.find(id='downloads').get_text()
ep, title = title_section.split(":", 1)
ep = ep.strip() ep = ep.strip()
title = title.strip() title = title.strip()
with open(f"{folder_path}/episode_{i}.txt", "w") as f: with open(f"{folder_path}/episode_{i}.txt", "w") as f:
f.write(f"{title}\n{transcript}") f.write(f"{title}\n{downloads}\n{transcript}")
print(f"{ep} {title}") print(f"{ep} {title}")

View File

@@ -1,4 +1,4 @@
from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext, from llama_index import (ServiceContext, StorageContext,
load_index_from_storage, Document, set_global_service_context) load_index_from_storage, Document, set_global_service_context)
from llama_index.node_parser import SimpleNodeParser from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex from llama_index import VectorStoreIndex
@@ -17,13 +17,16 @@ if not os.path.exists("./index/lock"):
episode_number = re.search(r'\d+', filename).group() episode_number = re.search(r'\d+', filename).group()
with open("./transcripts/" + filename, 'r') as f: with open("./transcripts/" + filename, 'r') as f:
title = f.readline().strip() title = f.readline().strip()
downloads = f.readline().strip()
content = f.read() content = f.read()
document = Document( document = Document(
text=content, text=content,
doc_id=filename, doc_id=filename,
metadata={ metadata={
"episode_number": episode_number, "episode_number": episode_number,
"episode_title": title "episode_title": title,
"episode_downloads": downloads,
"episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
} }
) )
documents.append(document) documents.append(document)