Compare commits

..

1 Commits

Author SHA1 Message Date
Romain Quinet
fdf987c342 updated examples 2023-10-07 00:36:03 +02:00
3 changed files with 26 additions and 45 deletions

View File

@@ -2,10 +2,8 @@
## What is Darknet Diaries? ## What is Darknet Diaries?
Well, let's ask our LLM: Well, let's ask our LLM:
> What is Darknet Diaries? > > What is Darknet Diaries?
>> Darknet Diaries is a podcast that explores true stories from the dark side of the internet. Hosted by Jack Rhysider, each episode delves into various topics related to hacking, cybercrime, and the underground world of the internet. The podcast features interviews with individuals who have been involved in hacking incidents, cyber espionage, or have experienced the consequences of online criminal activities. Darknet Diaries aims to educate and entertain listeners by providing an in-depth look into the fascinating and often mysterious world of the darknet. > > Darknet Diaries is a podcast that explores true stories from the dark side of the internet. Hosted by Jack Rhysider, each episode delves into various topics related to hacking, cybercrime, and the underground world of the internet. The podcast features interviews with individuals who have been involved in hacking incidents, cyber espionage, or have experienced the consequences of online criminal activities. Darknet Diaries aims to educate and entertain listeners by providing an in-depth look into the fascinating and often mysterious world of the darknet.
[darknetdiaries.com](http://darknetdiaries.com/)
## How to run ## How to run
### Install dependencies ### Install dependencies
@@ -30,14 +28,10 @@ python3 main.py
On the first run, it will generate the index. This can take a while, but it will be cached on disk for the next runs. On the first run, it will generate the index. This can take a while, but it will be cached on disk for the next runs.
You can then ask it any question about Darknet Diaries! You can then ask it any question about Darknet Diaries! Currently, it hallucinates a lot about episode numbers and titles. Other than that, it's pretty accurate!
## Examples ## Examples
> What is the intro of the podcast?
>> Typically, the host Jack Rhysider starts the podcast with a brief introduction about the episode's topic, followed by "These are true stories from the dark side of the internet. I'm Jack Rhysider. This is Darknet Diaries." Please note that the exact wording may vary from episode to episode.
(This is exactly the intro used in the podcast.)
### Episode 137: Predator ### Episode 137: Predator
[link to transcript](https://darknetdiaries.com/transcript/137/) [link to transcript](https://darknetdiaries.com/transcript/137/)
@@ -67,6 +61,3 @@ You can then ask it any questions about Darknet Diaries!
>>One of his memorable stories involves a physical penetration test where he had to break into a former employer's building. He used his knowledge of the building's layout and security mechanisms to gain access, even falling through a ceiling into a server room at one point. >>One of his memorable stories involves a physical penetration test where he had to break into a former employer's building. He used his knowledge of the building's layout and security mechanisms to gain access, even falling through a ceiling into a server room at one point.
>> >>
>>The episode also covers a project where Jason was tasked with hacking into a large, worldwide bank. His job was to examine the bank's mobile app for any potential security vulnerabilities that could expose customer or sensitive information. The episode provides a detailed look into the world of penetration testing, highlighting the importance of robust security measures in both physical and digital spaces. >>The episode also covers a project where Jason was tasked with hacking into a large, worldwide bank. His job was to examine the bank's mobile app for any potential security vulnerabilities that could expose customer or sensitive information. The episode provides a detailed look into the world of penetration testing, highlighting the importance of robust security measures in both physical and digital spaces.
>
> How many downloads does this episode have?
>> Episode 130 of Darknet Diaries, titled "JASON'S PEN TEST", has 667,528 downloads.

View File

@@ -1,7 +1,6 @@
import requests import requests
import os import os
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import json
folder_path = "transcripts" folder_path = "transcripts"
@@ -9,22 +8,17 @@ if not os.path.exists(folder_path):
os.makedirs(folder_path) os.makedirs(folder_path)
for i in range(1, 139): for i in range(1, 139):
try: url = f"https://darknetdiaries.com/transcript/{i}"
url = f"https://darknetdiaries.com/transcript/{i}" r = requests.get(url)
r = requests.get(url) soup = BeautifulSoup(r.text, 'html.parser')
soup = BeautifulSoup(r.text, 'html.parser') pre_section = soup.find('pre')
title_section = soup.find('h1')
transcript = soup.find('pre').get_text() if pre_section:
transcript = pre_section.get_text()
url = f"https://api.darknetdiaries.com/{i}.json" ep, title = title_section.get_text().split(":", 1)
r = requests.get(url) ep = ep.strip()
parsed_json = json.loads(r.text) title = title.strip()
title = parsed_json["episode_name"] with open(f"{folder_path}/episode_{i}.txt", "w") as f:
number = parsed_json["episode_number"] f.write(f"{title}\n{transcript}")
downloads = parsed_json["total_downloads"] print(f"{ep} {title}")
with open(f"{folder_path}/episode_{number}.txt", "w") as f:
f.write(f"{title}\n{downloads}\n{transcript}")
print(f"{number} {title}")
except Exception:
print(f"Failed scraping episode {i}")

20
main.py
View File

@@ -1,16 +1,12 @@
from llama_index import (ServiceContext, StorageContext, from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext,
load_index_from_storage, Document, set_global_service_context) load_index_from_storage, Document, set_global_service_context)
from llama_index.node_parser import SimpleNodeParser from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex from llama_index import VectorStoreIndex
from llama_index.llms import OpenAI, ChatMessage, MessageRole from llama_index.llms import OpenAI, ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate from llama_index.prompts import ChatPromptTemplate
from llama_index import set_global_handler
from llama_index.chat_engine.types import ChatMode
import os import os
import re import re
# set_global_handler("simple")
llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256) llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256)
service_context = ServiceContext.from_defaults(llm=llm) service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context) set_global_service_context(service_context)
@@ -21,16 +17,13 @@ if not os.path.exists("./index/lock"):
episode_number = re.search(r'\d+', filename).group() episode_number = re.search(r'\d+', filename).group()
with open("./transcripts/" + filename, 'r') as f: with open("./transcripts/" + filename, 'r') as f:
title = f.readline().strip() title = f.readline().strip()
downloads = f.readline().strip()
content = f.read() content = f.read()
document = Document( document = Document(
text=content, text=content,
doc_id=filename, doc_id=filename,
metadata={ metadata={
"episode_number": episode_number, "episode_number": episode_number,
"episode_title": title, "episode_title": title
"episode_downloads": downloads,
"episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
} }
) )
documents.append(document) documents.append(document)
@@ -94,13 +87,16 @@ refine_template = ChatPromptTemplate(chat_refine_msgs)
chat_engine = index.as_chat_engine( chat_engine = index.as_chat_engine(
text_qa_template=text_qa_template, text_qa_template=text_qa_template,
refine_template=refine_template, refine_template=refine_template
chat_mode=ChatMode.OPENAI
) )
while True: while True:
try: try:
chat_engine.chat_repl() user_prompt = input("Prompt: ")
streaming_response = chat_engine.stream_chat(user_prompt)
for token in streaming_response.response_gen:
print(token, end="")
print("\n")
except KeyboardInterrupt: except KeyboardInterrupt:
break break