forked from phito/darknet_diaries_llm
Include episode title and number in the data
This commit is contained in:
parent
f4a9c9bed7
commit
96c692aef7
@@ -6,8 +6,14 @@ for i in range(1, 139):
|
||||
r = requests.get(url)
|
||||
soup = BeautifulSoup(r.text, 'html.parser')
|
||||
pre_section = soup.find('pre')
|
||||
title_section = soup.find('h1')
|
||||
|
||||
if pre_section:
|
||||
text = pre_section.get_text()
|
||||
title = title_section.get_text()
|
||||
with open(f"data/episode_{i}.txt", "w") as f:
|
||||
f.write(text)
|
||||
f.write(
|
||||
f"Darknet Diaries - {title}\n" +
|
||||
text
|
||||
)
|
||||
print(title)
|
||||
|
Loading…
Reference in New Issue
Block a user