forked from phito/darknet_diaries_llm
Include episode title and number in data
This commit is contained in:
parent
f4a9c9bed7
commit
96c692aef7
@@ -6,8 +6,14 @@ for i in range(1, 139):
|
|||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
pre_section = soup.find('pre')
|
pre_section = soup.find('pre')
|
||||||
|
title_section = soup.find('h1')
|
||||||
|
|
||||||
if pre_section:
|
if pre_section:
|
||||||
text = pre_section.get_text()
|
text = pre_section.get_text()
|
||||||
|
title = title_section.get_text()
|
||||||
with open(f"data/episode_{i}.txt", "w") as f:
|
with open(f"data/episode_{i}.txt", "w") as f:
|
||||||
f.write(text)
|
f.write(
|
||||||
|
f"Darknet Diaries - {title}\n" +
|
||||||
|
text
|
||||||
|
)
|
||||||
|
print(title)
|
||||||
|
Loading…
Reference in New Issue
Block a user