6 changed files with 110 additions and 159 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -1,14 +0,0 @@
 root = true
 [*]
 charset = utf-8
 end_of_line = lf
 indent_size = 4
 indent_style = space
 insert_final_newline = true
 trim_trailing_whitespace = true
 max_line_length = 120
 [*.md]
 trim_trailing_whitespace = false
 max_line_length = 0
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,3 @@
 .env
 /transcripts
 /index
 /.idea
 /venv
--- a/README.MD
+++ b/README.MD
@ -9,15 +9,9 @@ Well, let's ask our LLM:
 ## How to run
 ### Install dependencies
-It is recommended to use a python version greater than or equal to ``3.10.0``.
+I have no idea what the correct way to install dependencies with python is. Somehow install these libraries and their dependencies:
-It is also recommended to create a venv, or to use an IDE that supports venv creation, so that all dependencies are installed locally to the project rather than globally. Alternatively, you can use https://virtualenv.pypa.io/en/latest/ to create isolated environments.
+- llama_index
-
+- beautifulsoup4
 Install the dependencies required to run the project by running the following command at the project root:
 ```shell
 pip install -r requirements.txt
 ```
 ### Execution
 Download transcripts:
 ```shell
@ -37,7 +31,6 @@ python3 main.py
 On the first run, it will generate the index. This can take a while, but it will be cached on disk for the next runs.
 You can then ask it any questions about Darknet Diaries!
 ## Examples
 > What is the intro of the podcast?
--- a/download_transcripts.py
+++ b/download_transcripts.py
@ -5,30 +5,26 @@ import json
 folder_path = "transcripts"
-if __name__ == '__main__':
+if not os.path.exists(folder_path):
-    if not os.path.exists(folder_path):
+    os.makedirs(folder_path)
        os.makedirs(folder_path)
-    for i in range(1, 139):
+for i in range(1, 139):
-        try:
+    try:
-            # fetch transcript
+        url = f"https://darknetdiaries.com/transcript/{i}"
-            url = f"https://darknetdiaries.com/transcript/{i}"
+        r = requests.get(url)
-            r = requests.get(url)
+        soup = BeautifulSoup(r.text, 'html.parser')
            soup = BeautifulSoup(r.text, 'html.parser')
-            transcript = soup.find('pre').get_text()
+        transcript = soup.find('pre').get_text()
-            # fetch transcript metadata
+        url = f"https://api.darknetdiaries.com/{i}.json"
-            url = f"https://api.darknetdiaries.com/{i}.json"
+        r = requests.get(url)
-            r = requests.get(url)
+        parsed_json = json.loads(r.text)
-            parsed_json = json.loads(r.text)
+        title = parsed_json["episode_name"]
-            title = parsed_json["episode_name"]
+        number = parsed_json["episode_number"]
-            number = parsed_json["episode_number"]
+        downloads = parsed_json["total_downloads"]
            downloads = parsed_json["total_downloads"]
-            # write transcript
+        with open(f"{folder_path}/episode_{number}.txt", "w") as f:
-            with open(f"{folder_path}/episode_{number}.txt", "w", encoding='utf-8') as f:
+            f.write(f"{title}\n{downloads}\n{transcript}")
-                f.write(f"{title}\n{downloads}\n{transcript}")
+        print(f"{number} {title}")
-            print(f"{number} {title}")
+    except Exception:
-        except Exception as err:
+        print(f"Failed scraping episode {i}")
            print(f"Failed scraping episode {i} : [{err}]")
--- a/main.py
+++ b/main.py
@ -4,109 +4,103 @@ from llama_index.node_parser import SimpleNodeParser
 from llama_index import VectorStoreIndex
 from llama_index.llms import OpenAI, ChatMessage, MessageRole
 from llama_index.prompts import ChatPromptTemplate
-# from llama_index import set_global_handler
+from llama_index import set_global_handler
 from llama_index.chat_engine.types import ChatMode
 from dotenv import load_dotenv
 import os
 import re
 # set_global_handler("simple")
-# load .env
+llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256)
 load_dotenv()
 OPEN_API_KEY = os.getenv('OPEN_API_KEY')
 # config llm context
 llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256, api_key=OPEN_API_KEY)
 service_context = ServiceContext.from_defaults(llm=llm)
 set_global_service_context(service_context)
-if __name__ == '__main__':
+if not os.path.exists("./index/lock"):
-    if not os.path.exists("./index/lock"):
+    documents = []
-        documents = []
+    for filename in os.listdir("./transcripts"):
-        for filename in os.listdir("./transcripts"):
+        episode_number = re.search(r'\d+', filename).group()
-            episode_number = re.search(r'\d+', filename).group()
+        with open("./transcripts/" + filename, 'r') as f:
-            with open("./transcripts/" + filename, 'r') as f:
+            title = f.readline().strip()
-                title = f.readline().strip()
+            downloads = f.readline().strip()
-                downloads = f.readline().strip()
+            content = f.read()
-                content = f.read()
+        document = Document(
-            document = Document(
+            text=content,
-                text=content,
+            doc_id=filename,
-                doc_id=filename,
+            metadata={
-                metadata={
+                "episode_number": episode_number,
-                    "episode_number": episode_number,
+                "episode_title": title,
-                    "episode_title": title,
+                "episode_downloads": downloads,
-                    "episode_downloads": downloads,
+                "episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
-                    "episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
+            }
                }
            )
            documents.append(document)
        parser = SimpleNodeParser.from_defaults()
        nodes = parser.get_nodes_from_documents(documents)
        index = VectorStoreIndex(nodes, show_progress=True)
        index.storage_context.persist(persist_dir="./index")
        open("./index/lock", 'a').close()
    else:
        print("Loading index...")
        storage_context = StorageContext.from_defaults(persist_dir="./index")
        index = load_index_from_storage(storage_context)
    chat_text_qa_msgs = [
        ChatMessage(
            role=MessageRole.SYSTEM,
            content=(
                "You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023."
                "You are an expert about it and will answer as such. You know about every episode up to number 138."
                "Always answer the question, even if the context isn't helpful."
                "Mention the number and title of the episodes you are referring to."
            )
        ),
        ChatMessage(
            role=MessageRole.USER,
            content=(
                "Context information is below.\n"
                "---------------------\n"
                "{context_str}\n"
                "---------------------\n"
                "Given the context information and not prior knowledge,"
                "answer the question: {query_str}\n"
            )
        )
-    ]
+        documents.append(document)
    text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)
-    chat_refine_msgs = [
+    parser = SimpleNodeParser.from_defaults()
-        ChatMessage(
+    nodes = parser.get_nodes_from_documents(documents)
            role=MessageRole.SYSTEM,
            content="Always answer the question, even if the context isn't helpful.",
        ),
        ChatMessage(
            role=MessageRole.USER,
            content=(
                "We have the opportunity to refine the original answer "
                "(only if needed) with some more context below.\n"
                "------------\n"
                "{context_msg}\n"
                "------------\n"
                "Given the new context, refine the original answer to better "
                "answer the question: {query_str}. "
                "If the context isn't useful, output the original answer again.\n"
                "Original Answer: {existing_answer}"
            ),
        ),
    ]
    refine_template = ChatPromptTemplate(chat_refine_msgs)
-    chat_engine = index.as_chat_engine(
+    index = VectorStoreIndex(nodes, show_progress=True)
-        text_qa_template=text_qa_template,
+    index.storage_context.persist(persist_dir="./index")
-        refine_template=refine_template,
+    open("./index/lock", 'a').close()
-        chat_mode=ChatMode.OPENAI
+else:
    print("Loading index...")
    storage_context = StorageContext.from_defaults(persist_dir="./index")
    index = load_index_from_storage(storage_context)
 chat_text_qa_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content=(
            "You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023."
            "You are an expert about it and will answer as such. You know about every episode up to number 138."
            "Always answer the question, even if the context isn't helpful."
            "Mention the number and title of the episodes you are referring to."
        )
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=(
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information and not prior knowledge,"
            "answer the question: {query_str}\n"
        )
    )
 ]
 text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)
 chat_refine_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content="Always answer the question, even if the context isn't helpful.",
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=(
            "We have the opportunity to refine the original answer "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{context_msg}\n"
            "------------\n"
            "Given the new context, refine the original answer to better "
            "answer the question: {query_str}. "
            "If the context isn't useful, output the original answer again.\n"
            "Original Answer: {existing_answer}"
        ),
    ),
 ]
 refine_template = ChatPromptTemplate(chat_refine_msgs)
 chat_engine = index.as_chat_engine(
    text_qa_template=text_qa_template,
    refine_template=refine_template,
    chat_mode=ChatMode.OPENAI
 )
 while True:
    try:
        chat_engine.chat_repl()
    except KeyboardInterrupt:
        break
    while True:
        try:
            chat_engine.chat_repl()
        except KeyboardInterrupt:
            break
--- a/requirements.txt
+++ b/requirements.txt
@ -1,16 +0,0 @@
 # =====================
 # Required dependencies
 # =====================
 # general deps
 requests~=2.31.0
 llama-index~=0.8.40
 beautifulsoup4~=4.12.2
 python-dotenv~=1.0.0
 # llama sub deps
 transformers~=4.34.0
 torch~=2.1.0
 # =====================
 # Development dependencies
 # =====================