Compare commits
6 Commits
fdf987c342
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fbcf0168f9 | ||
|
|
1a46ea4816 | ||
|
|
bf3fd878ac | ||
|
|
c47ff3d9ed | ||
|
|
a582d89c57 | ||
|
|
d0dd93a5ab |
19
README.MD
19
README.MD
@@ -2,8 +2,10 @@
|
|||||||
## What is Darknet Diaries?
|
## What is Darknet Diaries?
|
||||||
Well, let's ask our LLM:
|
Well, let's ask our LLM:
|
||||||
|
|
||||||
> > What is Darknet Diaries?
|
> What is Darknet Diaries?
|
||||||
> > Darknet Diaries is a podcast that explores true stories from the dark side of the internet. Hosted by Jack Rhysider, each episode delves into various topics related to hacking, cybercrime, and the underground world of the internet. The podcast features interviews with individuals who have been involved in hacking incidents, cyber espionage, or have experienced the consequences of online criminal activities. Darknet Diaries aims to educate and entertain listeners by providing an in-depth look into the fascinating and often mysterious world of the darknet.
|
>> Darknet Diaries is a podcast that explores true stories from the dark side of the internet. Hosted by Jack Rhysider, each episode delves into various topics related to hacking, cybercrime, and the underground world of the internet. The podcast features interviews with individuals who have been involved in hacking incidents, cyber espionage, or have experienced the consequences of online criminal activities. Darknet Diaries aims to educate and entertain listeners by providing an in-depth look into the fascinating and often mysterious world of the darknet.
|
||||||
|
|
||||||
|
[darknetdiaries.com](http://darknetdiaries.com/)
|
||||||
|
|
||||||
## How to run
|
## How to run
|
||||||
### Install dependencies
|
### Install dependencies
|
||||||
@@ -28,10 +30,14 @@ python3 main.py
|
|||||||
|
|
||||||
On the first run, it will generate the index. This can take a while, but the index is cached on disk for subsequent runs.
|
On the first run, it will generate the index. This can take a while, but the index is cached on disk for subsequent runs.
|
||||||
|
|
||||||
You can then ask it any questions about Darknet Diaries! Currently, it does hallucinate a lot about episode numbers and titles. Other than that, it's pretty accurate!
|
You can then ask it any questions about Darknet Diaries!
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
|
> What is the intro of the podcast?
|
||||||
|
>> Typically, the host Jack Rhysider starts the podcast with a brief introduction about the episode's topic, followed by "These are true stories from the dark side of the internet. I'm Jack Rhysider. This is Darknet Diaries." Please note that the exact wording may vary from episode to episode.
|
||||||
|
|
||||||
|
(This matches the podcast's actual intro word for word.)
|
||||||
|
|
||||||
### Episode 137: Predator
|
### Episode 137: Predator
|
||||||
[link to transcript](https://darknetdiaries.com/transcript/137/)
|
[link to transcript](https://darknetdiaries.com/transcript/137/)
|
||||||
|
|
||||||
@@ -60,4 +66,7 @@ You can then ask it any questions about Darknet Diaries! Currently, it does hall
|
|||||||
>>
|
>>
|
||||||
>>One of his memorable stories involves a physical penetration test where he had to break into a former employer's building. He used his knowledge of the building's layout and security mechanisms to gain access, even falling through a ceiling into a server room at one point.
|
>>One of his memorable stories involves a physical penetration test where he had to break into a former employer's building. He used his knowledge of the building's layout and security mechanisms to gain access, even falling through a ceiling into a server room at one point.
|
||||||
>>
|
>>
|
||||||
>>The episode also covers a project where Jason was tasked with hacking into a large, worldwide bank. His job was to examine the bank's mobile app for any potential security vulnerabilities that could expose customer or sensitive information. The episode provides a detailed look into the world of penetration testing, highlighting the importance of robust security measures in both physical and digital spaces.
|
>>The episode also covers a project where Jason was tasked with hacking into a large, worldwide bank. His job was to examine the bank's mobile app for any potential security vulnerabilities that could expose customer or sensitive information. The episode provides a detailed look into the world of penetration testing, highlighting the importance of robust security measures in both physical and digital spaces.
|
||||||
|
>
|
||||||
|
> How many downloads does this episode have?
|
||||||
|
>> Episode 130 of Darknet Diaries, titled "JASON'S PEN TEST", has 667,528 downloads.
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
|
||||||
folder_path = "transcripts"
|
folder_path = "transcripts"
|
||||||
|
|
||||||
@@ -8,17 +9,22 @@ if not os.path.exists(folder_path):
|
|||||||
os.makedirs(folder_path)
|
os.makedirs(folder_path)
|
||||||
|
|
||||||
for i in range(1, 139):
|
for i in range(1, 139):
|
||||||
url = f"https://darknetdiaries.com/transcript/{i}"
|
try:
|
||||||
r = requests.get(url)
|
url = f"https://darknetdiaries.com/transcript/{i}"
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
r = requests.get(url)
|
||||||
pre_section = soup.find('pre')
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
title_section = soup.find('h1')
|
|
||||||
|
|
||||||
if pre_section:
|
transcript = soup.find('pre').get_text()
|
||||||
transcript = pre_section.get_text()
|
|
||||||
ep, title = title_section.get_text().split(":", 1)
|
url = f"https://api.darknetdiaries.com/{i}.json"
|
||||||
ep = ep.strip()
|
r = requests.get(url)
|
||||||
title = title.strip()
|
parsed_json = json.loads(r.text)
|
||||||
with open(f"{folder_path}/episode_{i}.txt", "w") as f:
|
title = parsed_json["episode_name"]
|
||||||
f.write(f"{title}\n{transcript}")
|
number = parsed_json["episode_number"]
|
||||||
print(f"{ep} {title}")
|
downloads = parsed_json["total_downloads"]
|
||||||
|
|
||||||
|
with open(f"{folder_path}/episode_{number}.txt", "w") as f:
|
||||||
|
f.write(f"{title}\n{downloads}\n{transcript}")
|
||||||
|
print(f"{number} {title}")
|
||||||
|
except Exception:
|
||||||
|
print(f"Failed scraping episode {i}")
|
||||||
|
|||||||
20
main.py
20
main.py
@@ -1,12 +1,16 @@
|
|||||||
from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext,
|
from llama_index import (ServiceContext, StorageContext,
|
||||||
load_index_from_storage, Document, set_global_service_context)
|
load_index_from_storage, Document, set_global_service_context)
|
||||||
from llama_index.node_parser import SimpleNodeParser
|
from llama_index.node_parser import SimpleNodeParser
|
||||||
from llama_index import VectorStoreIndex
|
from llama_index import VectorStoreIndex
|
||||||
from llama_index.llms import OpenAI, ChatMessage, MessageRole
|
from llama_index.llms import OpenAI, ChatMessage, MessageRole
|
||||||
from llama_index.prompts import ChatPromptTemplate
|
from llama_index.prompts import ChatPromptTemplate
|
||||||
|
from llama_index import set_global_handler
|
||||||
|
from llama_index.chat_engine.types import ChatMode
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
# set_global_handler("simple")
|
||||||
|
|
||||||
llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256)
|
llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256)
|
||||||
service_context = ServiceContext.from_defaults(llm=llm)
|
service_context = ServiceContext.from_defaults(llm=llm)
|
||||||
set_global_service_context(service_context)
|
set_global_service_context(service_context)
|
||||||
@@ -17,13 +21,16 @@ if not os.path.exists("./index/lock"):
|
|||||||
episode_number = re.search(r'\d+', filename).group()
|
episode_number = re.search(r'\d+', filename).group()
|
||||||
with open("./transcripts/" + filename, 'r') as f:
|
with open("./transcripts/" + filename, 'r') as f:
|
||||||
title = f.readline().strip()
|
title = f.readline().strip()
|
||||||
|
downloads = f.readline().strip()
|
||||||
content = f.read()
|
content = f.read()
|
||||||
document = Document(
|
document = Document(
|
||||||
text=content,
|
text=content,
|
||||||
doc_id=filename,
|
doc_id=filename,
|
||||||
metadata={
|
metadata={
|
||||||
"episode_number": episode_number,
|
"episode_number": episode_number,
|
||||||
"episode_title": title
|
"episode_title": title,
|
||||||
|
"episode_downloads": downloads,
|
||||||
|
"episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
@@ -87,16 +94,13 @@ refine_template = ChatPromptTemplate(chat_refine_msgs)
|
|||||||
|
|
||||||
chat_engine = index.as_chat_engine(
|
chat_engine = index.as_chat_engine(
|
||||||
text_qa_template=text_qa_template,
|
text_qa_template=text_qa_template,
|
||||||
refine_template=refine_template
|
refine_template=refine_template,
|
||||||
|
chat_mode=ChatMode.OPENAI
|
||||||
)
|
)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
user_prompt = input("Prompt: ")
|
chat_engine.chat_repl()
|
||||||
streaming_response = chat_engine.stream_chat(user_prompt)
|
|
||||||
for token in streaming_response.response_gen:
|
|
||||||
print(token, end="")
|
|
||||||
print("\n")
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user