document metadata

This commit is contained in:
Romain Quinet 2023-10-07 00:30:46 +02:00
parent 395de571a4
commit 4e587aed9e
3 changed files with 21 additions and 14 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,3 @@
/data /transcripts
/index /index
/.idea /.idea

View File

@@ -1,6 +1,12 @@
import requests import requests
import os
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
folder_path = "transcripts"
if not os.path.exists(folder_path):
os.makedirs(folder_path)
for i in range(1, 139): for i in range(1, 139):
url = f"https://darknetdiaries.com/transcript/{i}" url = f"https://darknetdiaries.com/transcript/{i}"
r = requests.get(url) r = requests.get(url)
@@ -9,11 +15,10 @@ for i in range(1, 139):
title_section = soup.find('h1') title_section = soup.find('h1')
if pre_section: if pre_section:
text = pre_section.get_text() transcript = pre_section.get_text()
title = title_section.get_text() ep, title = title_section.get_text().split(":", 1)
with open(f"data/episode_{i}.txt", "w") as f: ep = ep.strip()
f.write( title = title.strip()
f"Darknet Diaries - {title}\n" + with open(f"{folder_path}/episode_{i}.txt", "w") as f:
text f.write(f"{title}\n{transcript}")
) print(f"{ep} {title}")
print(title)

12
main.py
View File

@@ -12,20 +12,22 @@ service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context) set_global_service_context(service_context)
if not os.path.exists("./index/lock"): if not os.path.exists("./index/lock"):
print("Generating index...")
documents = [] documents = []
for filename in os.listdir("./data"): for filename in os.listdir("./transcripts"):
episode_number = re.search(r'\d+', filename).group() episode_number = re.search(r'\d+', filename).group()
with open("./data/" + filename, 'r') as f: with open("./transcripts/" + filename, 'r') as f:
title = f.readline().strip()
content = f.read() content = f.read()
document = Document( document = Document(
text=content, text=content,
doc_id=filename,
metadata={ metadata={
"episode_number": episode_number "episode_number": episode_number,
"episode_title": title
} }
) )
documents.append(document)
documents = SimpleDirectoryReader('./data').load_data()
parser = SimpleNodeParser.from_defaults() parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents) nodes = parser.get_nodes_from_documents(documents)