forked from phito/darknet_diaries_llm
document metadata
This commit is contained in:
parent
395de571a4
commit
4e587aed9e
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,3 +1,3 @@
|
||||
/data
|
||||
/transcripts
|
||||
/index
|
||||
/.idea
|
@@ -1,6 +1,12 @@
|
||||
import requests
|
||||
import os
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
folder_path = "transcripts"
|
||||
|
||||
if not os.path.exists(folder_path):
|
||||
os.makedirs(folder_path)
|
||||
|
||||
for i in range(1, 139):
|
||||
url = f"https://darknetdiaries.com/transcript/{i}"
|
||||
r = requests.get(url)
|
||||
@@ -9,11 +15,10 @@ for i in range(1, 139):
|
||||
title_section = soup.find('h1')
|
||||
|
||||
if pre_section:
|
||||
text = pre_section.get_text()
|
||||
title = title_section.get_text()
|
||||
with open(f"data/episode_{i}.txt", "w") as f:
|
||||
f.write(
|
||||
f"Darknet Diaries - {title}\n" +
|
||||
text
|
||||
)
|
||||
print(title)
|
||||
transcript = pre_section.get_text()
|
||||
ep, title = title_section.get_text().split(":", 1)
|
||||
ep = ep.strip()
|
||||
title = title.strip()
|
||||
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
|
||||
f.write(f"{title}\n{transcript}")
|
||||
print(f"{ep} {title}")
|
||||
|
12
main.py
12
main.py
@@ -12,20 +12,22 @@ service_context = ServiceContext.from_defaults(llm=llm)
|
||||
set_global_service_context(service_context)
|
||||
|
||||
if not os.path.exists("./index/lock"):
|
||||
print("Generating index...")
|
||||
documents = []
|
||||
for filename in os.listdir("./data"):
|
||||
for filename in os.listdir("./transcripts"):
|
||||
episode_number = re.search(r'\d+', filename).group()
|
||||
with open("./data/" + filename, 'r') as f:
|
||||
with open("./transcripts/" + filename, 'r') as f:
|
||||
title = f.readline().strip()
|
||||
content = f.read()
|
||||
document = Document(
|
||||
text=content,
|
||||
doc_id=filename,
|
||||
metadata={
|
||||
"episode_number": episode_number
|
||||
"episode_number": episode_number,
|
||||
"episode_title": title
|
||||
}
|
||||
)
|
||||
documents.append(document)
|
||||
|
||||
documents = SimpleDirectoryReader('./data').load_data()
|
||||
parser = SimpleNodeParser.from_defaults()
|
||||
nodes = parser.get_nodes_from_documents(documents)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user