Compare commits

..

No commits in common. "deps-managment-and-dotenv" and "master" have entirely different histories.

6 changed files with 110 additions and 159 deletions

View File

@ -1,14 +0,0 @@
root = true
[*]
charset = utf-8
end_of_line = lf
indent_size = 4
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
max_line_length = 120
[*.md]
trim_trailing_whitespace = false
max_line_length = 0

2
.gitignore vendored
View File

@ -1,5 +1,3 @@
.env
/transcripts /transcripts
/index /index
/.idea /.idea
/venv

View File

@ -9,15 +9,9 @@ Well, let's ask our LLM:
## How to run ## How to run
### Install dependencies ### Install dependencies
It is recommended to use a python version greater than or equal to ``3.10.0``. I have no idea what the correct way to install dependencies with python is. Somehow install these libraries and their dependencies:
Another recommendation is to create a venv, or to use an IDE that supports venv creation, so that all dependencies are installed locally to the project rather than globally. Otherwise, you can use https://virtualenv.pypa.io/en/latest/ to create isolated environments. - llama_index
- beautifulsoup4
Install the dependencies required to run the project by running the following command at the project root:
```shell
pip install -r requirements.txt
```
### Execution ### Execution
Download transcripts: Download transcripts:
```shell ```shell
@ -37,7 +31,6 @@ python3 main.py
On the first run, it will generate the index. This can take a while, but it will be cached on disk for the next runs. On the first run, it will generate the index. This can take a while, but it will be cached on disk for the next runs.
You can then ask it any questions about Darknet Diaries! You can then ask it any questions about Darknet Diaries!
## Examples ## Examples
> What is the intro of the podcast? > What is the intro of the podcast?

View File

@ -5,20 +5,17 @@ import json
folder_path = "transcripts" folder_path = "transcripts"
if __name__ == '__main__': if not os.path.exists(folder_path):
if not os.path.exists(folder_path):
os.makedirs(folder_path) os.makedirs(folder_path)
for i in range(1, 139): for i in range(1, 139):
try: try:
# fetch transcript
url = f"https://darknetdiaries.com/transcript/{i}" url = f"https://darknetdiaries.com/transcript/{i}"
r = requests.get(url) r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser') soup = BeautifulSoup(r.text, 'html.parser')
transcript = soup.find('pre').get_text() transcript = soup.find('pre').get_text()
# fetch transcript metadata
url = f"https://api.darknetdiaries.com/{i}.json" url = f"https://api.darknetdiaries.com/{i}.json"
r = requests.get(url) r = requests.get(url)
parsed_json = json.loads(r.text) parsed_json = json.loads(r.text)
@ -26,9 +23,8 @@ if __name__ == '__main__':
number = parsed_json["episode_number"] number = parsed_json["episode_number"]
downloads = parsed_json["total_downloads"] downloads = parsed_json["total_downloads"]
# write transcript with open(f"{folder_path}/episode_{number}.txt", "w") as f:
with open(f"{folder_path}/episode_{number}.txt", "w", encoding='utf-8') as f:
f.write(f"{title}\n{downloads}\n{transcript}") f.write(f"{title}\n{downloads}\n{transcript}")
print(f"{number} {title}") print(f"{number} {title}")
except Exception as err: except Exception:
print(f"Failed scraping episode {i} : [{err}]") print(f"Failed scraping episode {i}")

34
main.py
View File

@ -4,25 +4,18 @@ from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex from llama_index import VectorStoreIndex
from llama_index.llms import OpenAI, ChatMessage, MessageRole from llama_index.llms import OpenAI, ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate from llama_index.prompts import ChatPromptTemplate
# from llama_index import set_global_handler from llama_index import set_global_handler
from llama_index.chat_engine.types import ChatMode from llama_index.chat_engine.types import ChatMode
from dotenv import load_dotenv
import os import os
import re import re
# set_global_handler("simple") # set_global_handler("simple")
# load .env llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256)
load_dotenv()
OPEN_API_KEY = os.getenv('OPEN_API_KEY')
# config llm context
llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256, api_key=OPEN_API_KEY)
service_context = ServiceContext.from_defaults(llm=llm) service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context) set_global_service_context(service_context)
if __name__ == '__main__': if not os.path.exists("./index/lock"):
if not os.path.exists("./index/lock"):
documents = [] documents = []
for filename in os.listdir("./transcripts"): for filename in os.listdir("./transcripts"):
episode_number = re.search(r'\d+', filename).group() episode_number = re.search(r'\d+', filename).group()
@ -48,12 +41,12 @@ if __name__ == '__main__':
index = VectorStoreIndex(nodes, show_progress=True) index = VectorStoreIndex(nodes, show_progress=True)
index.storage_context.persist(persist_dir="./index") index.storage_context.persist(persist_dir="./index")
open("./index/lock", 'a').close() open("./index/lock", 'a').close()
else: else:
print("Loading index...") print("Loading index...")
storage_context = StorageContext.from_defaults(persist_dir="./index") storage_context = StorageContext.from_defaults(persist_dir="./index")
index = load_index_from_storage(storage_context) index = load_index_from_storage(storage_context)
chat_text_qa_msgs = [ chat_text_qa_msgs = [
ChatMessage( ChatMessage(
role=MessageRole.SYSTEM, role=MessageRole.SYSTEM,
content=( content=(
@ -74,10 +67,10 @@ if __name__ == '__main__':
"answer the question: {query_str}\n" "answer the question: {query_str}\n"
) )
) )
] ]
text_qa_template = ChatPromptTemplate(chat_text_qa_msgs) text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)
chat_refine_msgs = [ chat_refine_msgs = [
ChatMessage( ChatMessage(
role=MessageRole.SYSTEM, role=MessageRole.SYSTEM,
content="Always answer the question, even if the context isn't helpful.", content="Always answer the question, even if the context isn't helpful.",
@ -96,17 +89,18 @@ if __name__ == '__main__':
"Original Answer: {existing_answer}" "Original Answer: {existing_answer}"
), ),
), ),
] ]
refine_template = ChatPromptTemplate(chat_refine_msgs) refine_template = ChatPromptTemplate(chat_refine_msgs)
chat_engine = index.as_chat_engine( chat_engine = index.as_chat_engine(
text_qa_template=text_qa_template, text_qa_template=text_qa_template,
refine_template=refine_template, refine_template=refine_template,
chat_mode=ChatMode.OPENAI chat_mode=ChatMode.OPENAI
) )
while True: while True:
try: try:
chat_engine.chat_repl() chat_engine.chat_repl()
except KeyboardInterrupt: except KeyboardInterrupt:
break break

View File

@ -1,16 +0,0 @@
# =====================
# Required dependencies
# =====================
# general deps
requests~=2.31.0
llama-index~=0.8.40
beautifulsoup4~=4.12.2
python-dotenv~=1.0.0
# llama sub deps
transformers~=4.34.0
torch~=2.1.0
# =====================
# Development dependencies
# =====================