Added more metadata
This commit is contained in:
parent
a582d89c57
commit
c47ff3d9ed
@ -11,14 +11,18 @@ for i in range(1, 139):
|
|||||||
url = f"https://darknetdiaries.com/transcript/{i}"
|
url = f"https://darknetdiaries.com/transcript/{i}"
|
||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
pre_section = soup.find('pre')
|
|
||||||
title_section = soup.find('h1')
|
|
||||||
|
|
||||||
if pre_section:
|
transcript = soup.find('pre').get_text()
|
||||||
transcript = pre_section.get_text()
|
title_section = soup.find('h1').get_text()
|
||||||
ep, title = title_section.get_text().split(":", 1)
|
|
||||||
ep = ep.strip()
|
url = f"https://darknetdiaries.com/episode/{i}"
|
||||||
title = title.strip()
|
r = requests.get(url)
|
||||||
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
f.write(f"{title}\n{transcript}")
|
downloads = soup.find(id='downloads').get_text()
|
||||||
print(f"{ep} {title}")
|
|
||||||
|
ep, title = title_section.split(":", 1)
|
||||||
|
ep = ep.strip()
|
||||||
|
title = title.strip()
|
||||||
|
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
|
||||||
|
f.write(f"{title}\n{downloads}\n{transcript}")
|
||||||
|
print(f"{ep} {title}")
|
||||||
|
7
main.py
7
main.py
@ -1,4 +1,4 @@
|
|||||||
from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext,
|
from llama_index import (ServiceContext, StorageContext,
|
||||||
load_index_from_storage, Document, set_global_service_context)
|
load_index_from_storage, Document, set_global_service_context)
|
||||||
from llama_index.node_parser import SimpleNodeParser
|
from llama_index.node_parser import SimpleNodeParser
|
||||||
from llama_index import VectorStoreIndex
|
from llama_index import VectorStoreIndex
|
||||||
@ -17,13 +17,16 @@ if not os.path.exists("./index/lock"):
|
|||||||
episode_number = re.search(r'\d+', filename).group()
|
episode_number = re.search(r'\d+', filename).group()
|
||||||
with open("./transcripts/" + filename, 'r') as f:
|
with open("./transcripts/" + filename, 'r') as f:
|
||||||
title = f.readline().strip()
|
title = f.readline().strip()
|
||||||
|
downloads = f.readline().strip()
|
||||||
content = f.read()
|
content = f.read()
|
||||||
document = Document(
|
document = Document(
|
||||||
text=content,
|
text=content,
|
||||||
doc_id=filename,
|
doc_id=filename,
|
||||||
metadata={
|
metadata={
|
||||||
"episode_number": episode_number,
|
"episode_number": episode_number,
|
||||||
"episode_title": title
|
"episode_title": title,
|
||||||
|
"episode_downloads": downloads,
|
||||||
|
"episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
|
Loading…
Reference in New Issue
Block a user