From 0730db572785e2639a6454c601dca67b08b4a311 Mon Sep 17 00:00:00 2001
From: EndMove
Date: Sat, 7 Oct 2023 15:14:18 +0200
Subject: [PATCH 1/3] feat: package manager

---
 .editorconfig          | 14 ++++++++++++++
 .gitignore             |  3 ++-
 README.MD => README.md | 15 +++++++++++----
 main.py                |  3 +--
 requirements.txt       |  9 +++++++++
 5 files changed, 37 insertions(+), 7 deletions(-)
 create mode 100644 .editorconfig
 rename README.MD => README.md (90%)
 create mode 100644 requirements.txt

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..c03a972
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,14 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+indent_size = 4
+indent_style = space
+insert_final_newline = true
+trim_trailing_whitespace = true
+max_line_length = 120
+
+[*.md]
+trim_trailing_whitespace = false
+max_line_length = 0
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 70df3e4..64e03f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /transcripts
 /index
-/.idea
\ No newline at end of file
+/.idea
+/venv
diff --git a/README.MD b/README.md
similarity index 90%
rename from README.MD
rename to README.md
index a465238..0efe6b3 100644
--- a/README.MD
+++ b/README.md
@@ -9,9 +9,15 @@ Well, let's ask our LLM:
 
 ## How to run
 ### Install dependencies
-I have no idea what the correct way to install dependencies with python is. Somehow install these libraries and their dependencies:
-- llama_index
-- beautifulsoup4
+It is recommended to use a Python version greater than or equal to ``3.10.0``.
+It is also recommended to create a venv, or to use an IDE that supports venv creation, so that all dependencies are installed locally to the project rather than globally. Alternatively, you can use https://virtualenv.pypa.io/en/latest/ to create isolated environments.
+
+Install the dependencies required to run the project by running the following command at the project root:
+
+```shell
+pip install -r requirements.txt
+```
+
 ### Execution
 Download transcripts:
 ```shell
@@ -31,6 +37,7 @@ python3 main.py
 On the first run, it will generate the index. This can take a while, but it will be cached on disk for the next runs.
 
 You can then ask it any questions about Darknet Diaries!
+
 ## Examples
 
 > What is the intro of the podcast?
@@ -69,4 +76,4 @@ You can then ask it any questions about Darknet Diaries!
 >>The episode also covers a project where Jason was tasked with hacking into a large, worldwide bank. His job was to examine the bank's mobile app for any potential security vulnerabilities that could expose customer or sensitive information. The episode provides a detailed look into the world of penetration testing, highlighting the importance of robust security measures in both physical and digital spaces.
 >
 > How many downloads does this episode have?
->> Episode 130 of Darknet Diaries, titled "JASON'S PEN TEST", has 667,528 downloads.
\ No newline at end of file
+>> Episode 130 of Darknet Diaries, titled "JASON'S PEN TEST", has 667,528 downloads.
diff --git a/main.py b/main.py index f52238e..b7785ad 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,7 @@ from llama_index.node_parser import SimpleNodeParser from llama_index import VectorStoreIndex from llama_index.llms import OpenAI, ChatMessage, MessageRole from llama_index.prompts import ChatPromptTemplate -from llama_index import set_global_handler +# from llama_index import set_global_handler from llama_index.chat_engine.types import ChatMode import os import re @@ -103,4 +103,3 @@ while True: chat_engine.chat_repl() except KeyboardInterrupt: break - diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2c96fbe --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +# ===================== +# Required dependencies +# ===================== +llama-index==0.8.* +beautifulsoup4==4.12.* + +# ===================== +# Development dependencies +# ===================== -- 2.45.2 From 6d7600e47e00bf9cfffe8adadda546a51b0e9bcd Mon Sep 17 00:00:00 2001 From: EndMove Date: Sat, 7 Oct 2023 16:55:18 +0200 Subject: [PATCH 2/3] feat: ups deps, add dotenv --- .gitignore | 1 + download_transcripts.py | 42 +++++----- main.py | 174 +++++++++++++++++++++------------------- requirements.txt | 11 ++- 4 files changed, 124 insertions(+), 104 deletions(-) diff --git a/.gitignore b/.gitignore index 64e03f3..35a8713 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.env /transcripts /index /.idea diff --git a/download_transcripts.py b/download_transcripts.py index 1f38e82..f577250 100644 --- a/download_transcripts.py +++ b/download_transcripts.py @@ -5,26 +5,30 @@ import json folder_path = "transcripts" -if not os.path.exists(folder_path): - os.makedirs(folder_path) +if __name__ == '__main__': + if not os.path.exists(folder_path): + os.makedirs(folder_path) -for i in range(1, 139): - try: - url = f"https://darknetdiaries.com/transcript/{i}" - r = requests.get(url) - soup = BeautifulSoup(r.text, 'html.parser') + for i in range(1, 139): + try: + # fetch transcript + url = f"https://darknetdiaries.com/transcript/{i}" + r = requests.get(url) + soup = BeautifulSoup(r.text, 'html.parser') - transcript = soup.find('pre').get_text() + transcript = soup.find('pre').get_text() - url = f"https://api.darknetdiaries.com/{i}.json" - r = requests.get(url) - parsed_json = json.loads(r.text) - title = parsed_json["episode_name"] - number = parsed_json["episode_number"] - downloads = parsed_json["total_downloads"] + # fetch transcript metadata + url = f"https://api.darknetdiaries.com/{i}.json" + r = requests.get(url) + parsed_json = json.loads(r.text) + title = parsed_json["episode_name"] + number = parsed_json["episode_number"] + downloads = parsed_json["total_downloads"] - with open(f"{folder_path}/episode_{number}.txt", "w") as f: - f.write(f"{title}\n{downloads}\n{transcript}") - print(f"{number} {title}") - except Exception: - print(f"Failed scraping episode {i}") + # write transcript + with open(f"{folder_path}/episode_{number}.txt", "w", encoding='utf-8') as f: + f.write(f"{title}\n{downloads}\n{transcript}") + print(f"{number} {title}") + except Exception as err: + print(f"Failed scraping episode {i} : [{err}]") diff --git a/main.py b/main.py index b7785ad..c5886d2 100644 --- a/main.py +++ b/main.py @@ -6,100 +6,108 @@ from llama_index.llms import OpenAI, ChatMessage, MessageRole from llama_index.prompts import ChatPromptTemplate # from llama_index import set_global_handler from llama_index.chat_engine.types import ChatMode +from dotenv import load_dotenv import os import re # 
set_global_handler("simple") -llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256) +# load .env +load_dotenv() +OPEN_API_KEY = os.getenv('OPEN_API_KEY') + +# config llm context +llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256, api_key="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") service_context = ServiceContext.from_defaults(llm=llm) set_global_service_context(service_context) -if not os.path.exists("./index/lock"): - documents = [] - for filename in os.listdir("./transcripts"): - episode_number = re.search(r'\d+', filename).group() - with open("./transcripts/" + filename, 'r') as f: - title = f.readline().strip() - downloads = f.readline().strip() - content = f.read() - document = Document( - text=content, - doc_id=filename, - metadata={ - "episode_number": episode_number, - "episode_title": title, - "episode_downloads": downloads, - "episode_url": f"https://darknetdiaries.com/episode/{episode_number}/" - } - ) - documents.append(document) +# TODO split in small functions +if __name__ == '__main__': + if not os.path.exists("./index/lock"): + documents = [] + for filename in os.listdir("./transcripts"): + episode_number = re.search(r'\d+', filename).group() + with open("./transcripts/" + filename, 'r') as f: + title = f.readline().strip() + downloads = f.readline().strip() + content = f.read() + document = Document( + text=content, + doc_id=filename, + metadata={ + "episode_number": episode_number, + "episode_title": title, + "episode_downloads": downloads, + "episode_url": f"https://darknetdiaries.com/episode/{episode_number}/" + } + ) + documents.append(document) - parser = SimpleNodeParser.from_defaults() - nodes = parser.get_nodes_from_documents(documents) + parser = SimpleNodeParser.from_defaults() + nodes = parser.get_nodes_from_documents(documents) - index = VectorStoreIndex(nodes, show_progress=True) - index.storage_context.persist(persist_dir="./index") - open("./index/lock", 'a').close() -else: - print("Loading index...") - storage_context = StorageContext.from_defaults(persist_dir="./index") - index = load_index_from_storage(storage_context) + index = VectorStoreIndex(nodes, show_progress=True) + index.storage_context.persist(persist_dir="./index") + open("./index/lock", 'a').close() + else: + print("Loading index...") + storage_context = StorageContext.from_defaults(persist_dir="./index") + index = load_index_from_storage(storage_context) -chat_text_qa_msgs = [ - ChatMessage( - role=MessageRole.SYSTEM, - content=( - "You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023." - "You are an expert about it and will answer as such. You know about every episode up to number 138." - "Always answer the question, even if the context isn't helpful." - "Mention the number and title of the episodes you are referring to." 
- ) - ), - ChatMessage( - role=MessageRole.USER, - content=( - "Context information is below.\n" - "---------------------\n" - "{context_str}\n" - "---------------------\n" - "Given the context information and not prior knowledge," - "answer the question: {query_str}\n" - ) - ) -] -text_qa_template = ChatPromptTemplate(chat_text_qa_msgs) - -chat_refine_msgs = [ - ChatMessage( - role=MessageRole.SYSTEM, - content="Always answer the question, even if the context isn't helpful.", - ), - ChatMessage( - role=MessageRole.USER, - content=( - "We have the opportunity to refine the original answer " - "(only if needed) with some more context below.\n" - "------------\n" - "{context_msg}\n" - "------------\n" - "Given the new context, refine the original answer to better " - "answer the question: {query_str}. " - "If the context isn't useful, output the original answer again.\n" - "Original Answer: {existing_answer}" + chat_text_qa_msgs = [ + ChatMessage( + role=MessageRole.SYSTEM, + content=( + "You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023." + "You are an expert about it and will answer as such. You know about every episode up to number 138." + "Always answer the question, even if the context isn't helpful." + "Mention the number and title of the episodes you are referring to." + ) ), - ), -] -refine_template = ChatPromptTemplate(chat_refine_msgs) + ChatMessage( + role=MessageRole.USER, + content=( + "Context information is below.\n" + "---------------------\n" + "{context_str}\n" + "---------------------\n" + "Given the context information and not prior knowledge," + "answer the question: {query_str}\n" + ) + ) + ] + text_qa_template = ChatPromptTemplate(chat_text_qa_msgs) -chat_engine = index.as_chat_engine( - text_qa_template=text_qa_template, - refine_template=refine_template, - chat_mode=ChatMode.OPENAI -) + chat_refine_msgs = [ + ChatMessage( + role=MessageRole.SYSTEM, + content="Always answer the question, even if the context isn't helpful.", + ), + ChatMessage( + role=MessageRole.USER, + content=( + "We have the opportunity to refine the original answer " + "(only if needed) with some more context below.\n" + "------------\n" + "{context_msg}\n" + "------------\n" + "Given the new context, refine the original answer to better " + "answer the question: {query_str}. 
" + "If the context isn't useful, output the original answer again.\n" + "Original Answer: {existing_answer}" + ), + ), + ] + refine_template = ChatPromptTemplate(chat_refine_msgs) -while True: - try: - chat_engine.chat_repl() - except KeyboardInterrupt: - break + chat_engine = index.as_chat_engine( + text_qa_template=text_qa_template, + refine_template=refine_template, + chat_mode=ChatMode.OPENAI + ) + + while True: + try: + chat_engine.chat_repl() + except KeyboardInterrupt: + break diff --git a/requirements.txt b/requirements.txt index 2c96fbe..9a52cf2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,15 @@ # ===================== # Required dependencies # ===================== -llama-index==0.8.* -beautifulsoup4==4.12.* +# general deps +requests~=2.31.0 +llama-index~=0.8.40 +beautifulsoup4~=4.12.2 +python-dotenv~=1.0.0 + +# llama sub deps +transformers~=4.34.0 +torch~=2.1.0 # ===================== # Development dependencies -- 2.45.2 From 7774fc269822a87572a059ae53d079cd5d80656c Mon Sep 17 00:00:00 2001 From: "NIHART, Jeremi" Date: Mon, 9 Oct 2023 10:06:56 +0200 Subject: [PATCH 3/3] fix: pr comments --- main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main.py b/main.py index c5886d2..082995e 100644 --- a/main.py +++ b/main.py @@ -17,11 +17,10 @@ load_dotenv() OPEN_API_KEY = os.getenv('OPEN_API_KEY') # config llm context -llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256, api_key="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") +llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256, api_key=OPEN_API_KEY) service_context = ServiceContext.from_defaults(llm=llm) set_global_service_context(service_context) -# TODO split in small functions if __name__ == '__main__': if not os.path.exists("./index/lock"): documents = [] -- 2.45.2