forked from phito/darknet_diaries_llm
Added more metadata
This commit is contained in:
parent
a582d89c57
commit
c47ff3d9ed
@ -11,14 +11,18 @@ for i in range(1, 139):
|
||||
url = f"https://darknetdiaries.com/transcript/{i}"
|
||||
r = requests.get(url)
|
||||
soup = BeautifulSoup(r.text, 'html.parser')
|
||||
pre_section = soup.find('pre')
|
||||
title_section = soup.find('h1')
|
||||
|
||||
if pre_section:
|
||||
transcript = pre_section.get_text()
|
||||
ep, title = title_section.get_text().split(":", 1)
|
||||
ep = ep.strip()
|
||||
title = title.strip()
|
||||
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
|
||||
f.write(f"{title}\n{transcript}")
|
||||
print(f"{ep} {title}")
|
||||
transcript = soup.find('pre').get_text()
|
||||
title_section = soup.find('h1').get_text()
|
||||
|
||||
url = f"https://darknetdiaries.com/episode/{i}"
|
||||
r = requests.get(url)
|
||||
soup = BeautifulSoup(r.text, 'html.parser')
|
||||
downloads = soup.find(id='downloads').get_text()
|
||||
|
||||
ep, title = title_section.split(":", 1)
|
||||
ep = ep.strip()
|
||||
title = title.strip()
|
||||
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
|
||||
f.write(f"{title}\n{downloads}\n{transcript}")
|
||||
print(f"{ep} {title}")
|
||||
|
7
main.py
7
main.py
@ -1,4 +1,4 @@
|
||||
from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext,
|
||||
from llama_index import (ServiceContext, StorageContext,
|
||||
load_index_from_storage, Document, set_global_service_context)
|
||||
from llama_index.node_parser import SimpleNodeParser
|
||||
from llama_index import VectorStoreIndex
|
||||
@ -17,13 +17,16 @@ if not os.path.exists("./index/lock"):
|
||||
episode_number = re.search(r'\d+', filename).group()
|
||||
with open("./transcripts/" + filename, 'r') as f:
|
||||
title = f.readline().strip()
|
||||
downloads = f.readline().strip()
|
||||
content = f.read()
|
||||
document = Document(
|
||||
text=content,
|
||||
doc_id=filename,
|
||||
metadata={
|
||||
"episode_number": episode_number,
|
||||
"episode_title": title
|
||||
"episode_title": title,
|
||||
"episode_downloads": downloads,
|
||||
"episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
|
||||
}
|
||||
)
|
||||
documents.append(document)
|
||||
|
Loading…
Reference in New Issue
Block a user