Compare commits

..

6 Commits

Author SHA1 Message Date
Romain Quinet
fbcf0168f9 update readme 2023-10-07 09:38:32 +02:00
Romain Quinet
1a46ea4816 Improved chat mode 2023-10-07 08:48:08 +02:00
Romain Quinet
bf3fd878ac Use DnD API 2023-10-07 08:32:07 +02:00
Romain Quinet
c47ff3d9ed Added more metadata 2023-10-07 00:57:45 +02:00
Romain Quinet
a582d89c57 updated examples 2023-10-07 00:42:49 +02:00
Romain Quinet
d0dd93a5ab updated examples 2023-10-07 00:38:32 +02:00
3 changed files with 45 additions and 26 deletions

View File

@@ -2,8 +2,10 @@
## What is Darknet Diaries?
Well, let's ask our LLM:
> > What is Darknet Diaries?
> > Darknet Diaries is a podcast that explores true stories from the dark side of the internet. Hosted by Jack Rhysider, each episode delves into various topics related to hacking, cybercrime, and the underground world of the internet. The podcast features interviews with individuals who have been involved in hacking incidents, cyber espionage, or have experienced the consequences of online criminal activities. Darknet Diaries aims to educate and entertain listeners by providing an in-depth look into the fascinating and often mysterious world of the darknet.
> What is Darknet Diaries?
>> Darknet Diaries is a podcast that explores true stories from the dark side of the internet. Hosted by Jack Rhysider, each episode delves into various topics related to hacking, cybercrime, and the underground world of the internet. The podcast features interviews with individuals who have been involved in hacking incidents, cyber espionage, or have experienced the consequences of online criminal activities. Darknet Diaries aims to educate and entertain listeners by providing an in-depth look into the fascinating and often mysterious world of the darknet.
[darknetdiaries.com](http://darknetdiaries.com/)
## How to run
### Install dependencies
@@ -28,10 +30,14 @@ python3 main.py
On the first run, it will generate the index. This can take a while, but it will be cached on disk for the next runs.
You can then ask it any questions about Darknet Diaries! Currently, it does hallucinate a lot about episode numbers and titles. Other than that, it's pretty accurate!
You can then ask it any questions about Darknet Diaries!
## Examples
> What is the intro of the podcast?
>> Typically, the host Jack Rhysider starts the podcast with a brief introduction about the episode's topic, followed by "These are true stories from the dark side of the internet. I'm Jack Rhysider. This is Darknet Diaries." Please note that the exact wording may vary from episode to episode.
(this is exactly the intro)
### Episode 137: Predator
[link to transcript](https://darknetdiaries.com/transcript/137/)
@@ -61,3 +67,6 @@ You can then ask it any questions about Darknet Diaries! Currently, it does hall
>>One of his memorable stories involves a physical penetration test where he had to break into a former employer's building. He used his knowledge of the building's layout and security mechanisms to gain access, even falling through a ceiling into a server room at one point.
>>
>>The episode also covers a project where Jason was tasked with hacking into a large, worldwide bank. His job was to examine the bank's mobile app for any potential security vulnerabilities that could expose customer or sensitive information. The episode provides a detailed look into the world of penetration testing, highlighting the importance of robust security measures in both physical and digital spaces.
>
> How many downloads does this episode have?
>> Episode 130 of Darknet Diaries, titled "JASON'S PEN TEST", has 667,528 downloads.

View File

@@ -1,6 +1,7 @@
import requests
import os
from bs4 import BeautifulSoup
import json
folder_path = "transcripts"
@@ -8,17 +9,22 @@ if not os.path.exists(folder_path):
os.makedirs(folder_path)
for i in range(1, 139):
url = f"https://darknetdiaries.com/transcript/{i}"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
pre_section = soup.find('pre')
title_section = soup.find('h1')
try:
url = f"https://darknetdiaries.com/transcript/{i}"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
if pre_section:
transcript = pre_section.get_text()
ep, title = title_section.get_text().split(":", 1)
ep = ep.strip()
title = title.strip()
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
f.write(f"{title}\n{transcript}")
print(f"{ep} {title}")
transcript = soup.find('pre').get_text()
url = f"https://api.darknetdiaries.com/{i}.json"
r = requests.get(url)
parsed_json = json.loads(r.text)
title = parsed_json["episode_name"]
number = parsed_json["episode_number"]
downloads = parsed_json["total_downloads"]
with open(f"{folder_path}/episode_{number}.txt", "w") as f:
f.write(f"{title}\n{downloads}\n{transcript}")
print(f"{number} {title}")
except Exception:
print(f"Failed scraping episode {i}")

main.py (20 changed lines)
View File
View File

@@ -1,12 +1,16 @@
from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext,
from llama_index import (ServiceContext, StorageContext,
load_index_from_storage, Document, set_global_service_context)
from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex
from llama_index.llms import OpenAI, ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate
from llama_index import set_global_handler
from llama_index.chat_engine.types import ChatMode
import os
import re
# set_global_handler("simple")
llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256)
service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context)
@@ -17,13 +21,16 @@ if not os.path.exists("./index/lock"):
episode_number = re.search(r'\d+', filename).group()
with open("./transcripts/" + filename, 'r') as f:
title = f.readline().strip()
downloads = f.readline().strip()
content = f.read()
document = Document(
text=content,
doc_id=filename,
metadata={
"episode_number": episode_number,
"episode_title": title
"episode_title": title,
"episode_downloads": downloads,
"episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
}
)
documents.append(document)
@@ -87,16 +94,13 @@ refine_template = ChatPromptTemplate(chat_refine_msgs)
chat_engine = index.as_chat_engine(
text_qa_template=text_qa_template,
refine_template=refine_template
refine_template=refine_template,
chat_mode=ChatMode.OPENAI
)
while True:
try:
user_prompt = input("Prompt: ")
streaming_response = chat_engine.stream_chat(user_prompt)
for token in streaming_response.response_gen:
print(token, end="")
print("\n")
chat_engine.chat_repl()
except KeyboardInterrupt:
break