document metadata

This commit is contained in:
Romain Quinet 2023-10-07 00:30:46 +02:00
parent 395de571a4
commit 4e587aed9e
3 changed files with 21 additions and 14 deletions

2
.gitignore vendored
View File

@ -1,3 +1,3 @@
/data
/transcripts
/index
/.idea

View File

@ -1,6 +1,12 @@
import requests
import os
from bs4 import BeautifulSoup
folder_path = "transcripts"
if not os.path.exists(folder_path):
os.makedirs(folder_path)
for i in range(1, 139):
url = f"https://darknetdiaries.com/transcript/{i}"
r = requests.get(url)
@ -9,11 +15,10 @@ for i in range(1, 139):
title_section = soup.find('h1')
if pre_section:
text = pre_section.get_text()
title = title_section.get_text()
with open(f"data/episode_{i}.txt", "w") as f:
f.write(
f"Darknet Diaries - {title}\n" +
text
)
print(title)
transcript = pre_section.get_text()
ep, title = title_section.get_text().split(":", 1)
ep = ep.strip()
title = title.strip()
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
f.write(f"{title}\n{transcript}")
print(f"{ep} {title}")

12
main.py
View File

@ -12,20 +12,22 @@ service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context)
if not os.path.exists("./index/lock"):
print("Generating index...")
documents = []
for filename in os.listdir("./data"):
for filename in os.listdir("./transcripts"):
episode_number = re.search(r'\d+', filename).group()
with open("./data/" + filename, 'r') as f:
with open("./transcripts/" + filename, 'r') as f:
title = f.readline().strip()
content = f.read()
document = Document(
text=content,
doc_id=filename,
metadata={
"episode_number": episode_number
"episode_number": episode_number,
"episode_title": title
}
)
documents.append(document)
documents = SimpleDirectoryReader('./data').load_data()
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)