forked from phito/darknet_diaries_llm
document metadata
This commit is contained in:
parent
395de571a4
commit
4e587aed9e
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,3 +1,3 @@
|
|||||||
/data
|
/transcripts
|
||||||
/index
|
/index
|
||||||
/.idea
|
/.idea
|
@ -1,6 +1,12 @@
|
|||||||
import requests
|
import requests
|
||||||
|
import os
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
folder_path = "transcripts"
|
||||||
|
|
||||||
|
if not os.path.exists(folder_path):
|
||||||
|
os.makedirs(folder_path)
|
||||||
|
|
||||||
for i in range(1, 139):
|
for i in range(1, 139):
|
||||||
url = f"https://darknetdiaries.com/transcript/{i}"
|
url = f"https://darknetdiaries.com/transcript/{i}"
|
||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
@ -9,11 +15,10 @@ for i in range(1, 139):
|
|||||||
title_section = soup.find('h1')
|
title_section = soup.find('h1')
|
||||||
|
|
||||||
if pre_section:
|
if pre_section:
|
||||||
text = pre_section.get_text()
|
transcript = pre_section.get_text()
|
||||||
title = title_section.get_text()
|
ep, title = title_section.get_text().split(":", 1)
|
||||||
with open(f"data/episode_{i}.txt", "w") as f:
|
ep = ep.strip()
|
||||||
f.write(
|
title = title.strip()
|
||||||
f"Darknet Diaries - {title}\n" +
|
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
|
||||||
text
|
f.write(f"{title}\n{transcript}")
|
||||||
)
|
print(f"{ep} {title}")
|
||||||
print(title)
|
|
||||||
|
12
main.py
12
main.py
@ -12,20 +12,22 @@ service_context = ServiceContext.from_defaults(llm=llm)
|
|||||||
set_global_service_context(service_context)
|
set_global_service_context(service_context)
|
||||||
|
|
||||||
if not os.path.exists("./index/lock"):
|
if not os.path.exists("./index/lock"):
|
||||||
print("Generating index...")
|
|
||||||
documents = []
|
documents = []
|
||||||
for filename in os.listdir("./data"):
|
for filename in os.listdir("./transcripts"):
|
||||||
episode_number = re.search(r'\d+', filename).group()
|
episode_number = re.search(r'\d+', filename).group()
|
||||||
with open("./data/" + filename, 'r') as f:
|
with open("./transcripts/" + filename, 'r') as f:
|
||||||
|
title = f.readline().strip()
|
||||||
content = f.read()
|
content = f.read()
|
||||||
document = Document(
|
document = Document(
|
||||||
text=content,
|
text=content,
|
||||||
|
doc_id=filename,
|
||||||
metadata={
|
metadata={
|
||||||
"episode_number": episode_number
|
"episode_number": episode_number,
|
||||||
|
"episode_title": title
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
documents.append(document)
|
||||||
|
|
||||||
documents = SimpleDirectoryReader('./data').load_data()
|
|
||||||
parser = SimpleNodeParser.from_defaults()
|
parser = SimpleNodeParser.from_defaults()
|
||||||
nodes = parser.get_nodes_from_documents(documents)
|
nodes = parser.get_nodes_from_documents(documents)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user