diff --git a/.gitignore b/.gitignore index 64e03f3..35a8713 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.env /transcripts /index /.idea diff --git a/download_transcripts.py b/download_transcripts.py index 1f38e82..f577250 100644 --- a/download_transcripts.py +++ b/download_transcripts.py @@ -5,26 +5,30 @@ import json folder_path = "transcripts" -if not os.path.exists(folder_path): - os.makedirs(folder_path) +if __name__ == '__main__': + if not os.path.exists(folder_path): + os.makedirs(folder_path) -for i in range(1, 139): - try: - url = f"https://darknetdiaries.com/transcript/{i}" - r = requests.get(url) - soup = BeautifulSoup(r.text, 'html.parser') + for i in range(1, 139): + try: + # fetch transcript + url = f"https://darknetdiaries.com/transcript/{i}" + r = requests.get(url) + soup = BeautifulSoup(r.text, 'html.parser') - transcript = soup.find('pre').get_text() + transcript = soup.find('pre').get_text() - url = f"https://api.darknetdiaries.com/{i}.json" - r = requests.get(url) - parsed_json = json.loads(r.text) - title = parsed_json["episode_name"] - number = parsed_json["episode_number"] - downloads = parsed_json["total_downloads"] + # fetch transcript metadata + url = f"https://api.darknetdiaries.com/{i}.json" + r = requests.get(url) + parsed_json = json.loads(r.text) + title = parsed_json["episode_name"] + number = parsed_json["episode_number"] + downloads = parsed_json["total_downloads"] - with open(f"{folder_path}/episode_{number}.txt", "w") as f: - f.write(f"{title}\n{downloads}\n{transcript}") - print(f"{number} {title}") - except Exception: - print(f"Failed scraping episode {i}") + # write transcript + with open(f"{folder_path}/episode_{number}.txt", "w", encoding='utf-8') as f: + f.write(f"{title}\n{downloads}\n{transcript}") + print(f"{number} {title}") + except Exception as err: + print(f"Failed scraping episode {i} : [{err}]") diff --git a/main.py b/main.py index b7785ad..c5886d2 100644 --- a/main.py +++ b/main.py @@ -6,100 +6,108 @@ 
from llama_index.llms import OpenAI, ChatMessage, MessageRole from llama_index.prompts import ChatPromptTemplate # from llama_index import set_global_handler from llama_index.chat_engine.types import ChatMode +from dotenv import load_dotenv import os import re # set_global_handler("simple") -llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256) +# load .env +load_dotenv() +OPEN_API_KEY = os.getenv('OPEN_API_KEY') + +# config llm context +llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256, api_key=OPEN_API_KEY) service_context = ServiceContext.from_defaults(llm=llm) set_global_service_context(service_context) -if not os.path.exists("./index/lock"): - documents = [] - for filename in os.listdir("./transcripts"): - episode_number = re.search(r'\d+', filename).group() - with open("./transcripts/" + filename, 'r') as f: - title = f.readline().strip() - downloads = f.readline().strip() - content = f.read() - document = Document( - text=content, - doc_id=filename, - metadata={ - "episode_number": episode_number, - "episode_title": title, - "episode_downloads": downloads, - "episode_url": f"https://darknetdiaries.com/episode/{episode_number}/" - } - ) - documents.append(document) +# TODO split in small functions +if __name__ == '__main__': + if not os.path.exists("./index/lock"): + documents = [] + for filename in os.listdir("./transcripts"): + episode_number = re.search(r'\d+', filename).group() + with open("./transcripts/" + filename, 'r') as f: + title = f.readline().strip() + downloads = f.readline().strip() + content = f.read() + document = Document( + text=content, + doc_id=filename, + metadata={ + "episode_number": episode_number, + "episode_title": title, + "episode_downloads": downloads, + "episode_url": f"https://darknetdiaries.com/episode/{episode_number}/" + } + ) + documents.append(document) - parser = 
SimpleNodeParser.from_defaults() + nodes = parser.get_nodes_from_documents(documents) - index = VectorStoreIndex(nodes, show_progress=True) - index.storage_context.persist(persist_dir="./index") - open("./index/lock", 'a').close() -else: - print("Loading index...") - storage_context = StorageContext.from_defaults(persist_dir="./index") - index = load_index_from_storage(storage_context) + index = VectorStoreIndex(nodes, show_progress=True) + index.storage_context.persist(persist_dir="./index") + open("./index/lock", 'a').close() + else: + print("Loading index...") + storage_context = StorageContext.from_defaults(persist_dir="./index") + index = load_index_from_storage(storage_context) -chat_text_qa_msgs = [ - ChatMessage( - role=MessageRole.SYSTEM, - content=( - "You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023." - "You are an expert about it and will answer as such. You know about every episode up to number 138." - "Always answer the question, even if the context isn't helpful." - "Mention the number and title of the episodes you are referring to." - ) - ), - ChatMessage( - role=MessageRole.USER, - content=( - "Context information is below.\n" - "---------------------\n" - "{context_str}\n" - "---------------------\n" - "Given the context information and not prior knowledge," - "answer the question: {query_str}\n" - ) - ) -] -text_qa_template = ChatPromptTemplate(chat_text_qa_msgs) - -chat_refine_msgs = [ - ChatMessage( - role=MessageRole.SYSTEM, - content="Always answer the question, even if the context isn't helpful.", - ), - ChatMessage( - role=MessageRole.USER, - content=( - "We have the opportunity to refine the original answer " - "(only if needed) with some more context below.\n" - "------------\n" - "{context_msg}\n" - "------------\n" - "Given the new context, refine the original answer to better " - "answer the question: {query_str}. 
" - "If the context isn't useful, output the original answer again.\n" - "Original Answer: {existing_answer}" + chat_text_qa_msgs = [ + ChatMessage( + role=MessageRole.SYSTEM, + content=( + "You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023." + "You are an expert about it and will answer as such. You know about every episode up to number 138." + "Always answer the question, even if the context isn't helpful." + "Mention the number and title of the episodes you are referring to." + ) ), - ), -] -refine_template = ChatPromptTemplate(chat_refine_msgs) + ChatMessage( + role=MessageRole.USER, + content=( + "Context information is below.\n" + "---------------------\n" + "{context_str}\n" + "---------------------\n" + "Given the context information and not prior knowledge," + "answer the question: {query_str}\n" + ) + ) + ] + text_qa_template = ChatPromptTemplate(chat_text_qa_msgs) -chat_engine = index.as_chat_engine( - text_qa_template=text_qa_template, - refine_template=refine_template, - chat_mode=ChatMode.OPENAI -) + chat_refine_msgs = [ + ChatMessage( + role=MessageRole.SYSTEM, + content="Always answer the question, even if the context isn't helpful.", + ), + ChatMessage( + role=MessageRole.USER, + content=( + "We have the opportunity to refine the original answer " + "(only if needed) with some more context below.\n" + "------------\n" + "{context_msg}\n" + "------------\n" + "Given the new context, refine the original answer to better " + "answer the question: {query_str}. 
" + "If the context isn't useful, output the original answer again.\n" + "Original Answer: {existing_answer}" + ), + ), + ] + refine_template = ChatPromptTemplate(chat_refine_msgs) -while True: - try: - chat_engine.chat_repl() - except KeyboardInterrupt: - break + chat_engine = index.as_chat_engine( + text_qa_template=text_qa_template, + refine_template=refine_template, + chat_mode=ChatMode.OPENAI + ) + + while True: + try: + chat_engine.chat_repl() + except KeyboardInterrupt: + break diff --git a/requirements.txt b/requirements.txt index 2c96fbe..9a52cf2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,15 @@ # ===================== # Required dependencies # ===================== -llama-index==0.8.* -beautifulsoup4==4.12.* +# general deps +requests~=2.31.0 +llama-index~=0.8.40 +beautifulsoup4~=4.12.2 +python-dotenv~=1.0.0 + +# llama sub deps +transformers~=4.34.0 +torch~=2.1.0 # ===================== # Development dependencies