darknet_diaries_llm/main.py

from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext, PromptTemplate,
                         load_index_from_storage, Document, set_global_service_context)
from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex
from llama_index.llms import OpenAI, ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate
import os
import re

# One GPT-4 client shared by the whole script: temperature 0 and a cap of
# 256 generated tokens per completion.
llm = OpenAI(
    model="gpt-4",
    temperature=0,
    max_tokens=256,
)

# Wrap the client in a service context and register it as the global one, so
# the index and chat engine below do not need it passed explicitly.
service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context)

def _episode_metadata(file_path):
    """Return the metadata dict for one transcript file.

    The episode number is taken to be the first run of digits in the file's
    base name (directory components are ignored so digits in a parent path
    cannot be mistaken for it).

    Args:
        file_path: Path of the file, as handed over by the directory reader.

    Returns:
        ``{"episode_number": "<digits>"}``, or an empty dict when the file
        name contains no digits, so a stray file cannot abort indexing.
    """
    match = re.search(r'\d+', os.path.basename(file_path))
    return {"episode_number": match.group()} if match else {}


# "./index/lock" is created only after the index has been persisted, so its
# presence means a complete index is available on disk.
if not os.path.exists("./index/lock"):
    print("Generating index...")
    # Attach the episode number through the reader's file_metadata hook so it
    # actually ends up on the documents that get parsed and indexed.
    documents = SimpleDirectoryReader(
        './data',
        file_metadata=_episode_metadata,
    ).load_data()
    parser = SimpleNodeParser.from_defaults()
    nodes = parser.get_nodes_from_documents(documents)

    index = VectorStoreIndex(nodes, show_progress=True)
    index.storage_context.persist(persist_dir="./index")
    # Touch the marker last: an interrupted build leaves no lock behind and
    # is therefore redone on the next run.
    open("./index/lock", 'a').close()
else:
    print("Loading index...")
    storage_context = StorageContext.from_defaults(persist_dir="./index")
    index = load_index_from_storage(storage_context)

# Prompt with {chat_history} and {question} placeholders, framed by separator
# lines. NOTE(review): nothing in the visible part of this file references
# custom_prompt — confirm whether it is still needed.
_prompt_rule = "----------------"
custom_prompt = PromptTemplate(
    "\n".join(
        [
            _prompt_rule,
            "Chat history: {chat_history}",
            _prompt_rule,
            "Please answer this question by referring to the podcast: {question}",
        ]
    )
)

# Messages for the question-answering prompt: a system message that sets the
# assistant's persona and a user message carrying the {context_str} and
# {query_str} placeholders.
chat_text_qa_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content=(
            # Adjacent string literals are concatenated verbatim, so every
            # sentence carries its own trailing space; without it the model
            # receives "2023.You are ..." as one run-on token sequence.
            "You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023. "
            "You are an expert about it and will answer as such. You know about every episode up to number 138. "
            "Always answer the question, even if the context isn't helpful."
        )
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=(
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            # Trailing space keeps "knowledge," and "answer" apart.
            "Given the context information and not prior knowledge, "
            "answer the question: {query_str}\n"
        )
    )
]
text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

# Text of the refine prompt. The user part carries the {context_msg},
# {query_str} and {existing_answer} placeholders.
_refine_system_text = "Always answer the question, even if the context isn't helpful."
_refine_user_text = (
    "We have the opportunity to refine the original answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "Given the new context, refine the original answer to better "
    "answer the question: {query_str}. "
    "If the context isn't useful, output the original answer again.\n"
    "Original Answer: {existing_answer}"
)

# System message first, then the user message, wrapped into a chat template.
chat_refine_msgs = [
    ChatMessage(role=MessageRole.SYSTEM, content=_refine_system_text),
    ChatMessage(role=MessageRole.USER, content=_refine_user_text),
]
refine_template = ChatPromptTemplate(chat_refine_msgs)

# Build the chat engine on top of the index, handing it the two custom
# prompt templates defined above as keyword arguments.
_engine_prompts = {
    "text_qa_template": text_qa_template,
    "refine_template": refine_template,
}
chat_engine = index.as_chat_engine(**_engine_prompts)

# Interactive loop: read a prompt, stream the answer token by token, repeat.
# Ctrl-C (KeyboardInterrupt) or end of input (EOFError, e.g. Ctrl-D or an
# exhausted pipe) ends the session quietly instead of with a traceback.
while True:
    try:
        user_prompt = input("Prompt: ")
        streaming_response = chat_engine.stream_chat(user_prompt)
        for token in streaming_response.response_gen:
            # Flush each token: stdout is line-buffered on a terminal, so
            # without it the "streamed" answer would only show up in one
            # piece once the final newline is printed.
            print(token, end="", flush=True)
        print("\n")
    except (KeyboardInterrupt, EOFError):
        break
initial commit 2023-10-06 21:35:53 +02:00			`from llama_index import (SimpleDirectoryReader, ServiceContext, StorageContext, PromptTemplate,`
			`load_index_from_storage, Document, set_global_service_context)`
			`from llama_index.node_parser import SimpleNodeParser`
			`from llama_index import VectorStoreIndex`
Chat mode 2023-10-06 23:22:10 +02:00			`from llama_index.llms import OpenAI, ChatMessage, MessageRole`
Better prompts 2023-10-06 23:45:47 +02:00			`from llama_index.prompts import ChatPromptTemplate`
initial commit 2023-10-06 21:35:53 +02:00			`import os`
			`import re`

Use GPT4 2023-10-06 22:52:42 +02:00			`llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256)`
initial commit 2023-10-06 21:35:53 +02:00			`service_context = ServiceContext.from_defaults(llm=llm)`
			`set_global_service_context(service_context)`

			`if not os.path.exists("./index/lock"):`
Better prompt 2023-10-06 22:43:39 +02:00			`print("Generating index...")`
initial commit 2023-10-06 21:35:53 +02:00			`documents = []`
			`for filename in os.listdir("./data"):`
			`episode_number = re.search(r'\d+', filename).group()`
			`with open("./data/" + filename, 'r') as f:`
			`content = f.read()`
			`document = Document(`
			`text=content,`
			`metadata={`
			`"episode_number": episode_number`
			`}`
			`)`

			`documents = SimpleDirectoryReader('./data').load_data()`
			`parser = SimpleNodeParser.from_defaults()`
			`nodes = parser.get_nodes_from_documents(documents)`

			`index = VectorStoreIndex(nodes, show_progress=True)`
			`index.storage_context.persist(persist_dir="./index")`
			`open("./index/lock", 'a').close()`
			`else:`
Better prompt 2023-10-06 22:43:39 +02:00			`print("Loading index...")`
initial commit 2023-10-06 21:35:53 +02:00			`storage_context = StorageContext.from_defaults(persist_dir="./index")`
			`index = load_index_from_storage(storage_context)`

Chat mode 2023-10-06 23:22:10 +02:00			`custom_prompt = PromptTemplate(`
Better prompt 2023-10-06 22:43:39 +02:00			`"----------------\n"`
Chat mode 2023-10-06 23:22:10 +02:00			`"Chat history: {chat_history}\n"`
Better prompt 2023-10-06 22:43:39 +02:00			`"----------------\n"`
Chat mode 2023-10-06 23:22:10 +02:00			`"Please answer this question by referring to the podcast: {question}"`
initial commit 2023-10-06 21:35:53 +02:00			`)`
Chat mode 2023-10-06 23:22:10 +02:00
Better prompts 2023-10-06 23:45:47 +02:00			`chat_text_qa_msgs = [`
			`ChatMessage(`
			`role=MessageRole.SYSTEM,`
			`content=(`
			`"You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023."`
			`"You are an expert about it and will answer as such. You know about every episode up to number 138."`
			`"Always answer the question, even if the context isn't helpful."`
			`)`
			`),`
			`ChatMessage(`
			`role=MessageRole.USER,`
			`content=(`
			`"Context information is below.\n"`
			`"---------------------\n"`
			`"{context_str}\n"`
			`"---------------------\n"`
			`"Given the context information and not prior knowledge,"`
			`"answer the question: {query_str}\n"`
			`)`
			`)`
			`]`
			`text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)`
Chat mode 2023-10-06 23:22:10 +02:00
Better prompts 2023-10-06 23:45:47 +02:00			`chat_refine_msgs = [`
			`ChatMessage(`
			`role=MessageRole.SYSTEM,`
			`content="Always answer the question, even if the context isn't helpful.",`
			`),`
			`ChatMessage(`
			`role=MessageRole.USER,`
			`content=(`
			`"We have the opportunity to refine the original answer "`
			`"(only if needed) with some more context below.\n"`
			`"------------\n"`
			`"{context_msg}\n"`
			`"------------\n"`
			`"Given the new context, refine the original answer to better "`
			`"answer the question: {query_str}. "`
			`"If the context isn't useful, output the original answer again.\n"`
			`"Original Answer: {existing_answer}"`
			`),`
			`),`
			`]`
			`refine_template = ChatPromptTemplate(chat_refine_msgs)`

			`chat_engine = index.as_chat_engine(`
			`text_qa_template=text_qa_template,`
			`refine_template=refine_template`
			`)`
initial commit 2023-10-06 21:35:53 +02:00
			`while True:`
			`try:`
			`user_prompt = input("Prompt: ")`
Chat mode 2023-10-06 23:22:10 +02:00			`streaming_response = chat_engine.stream_chat(user_prompt)`
			`for token in streaming_response.response_gen:`
			`print(token, end="")`
			`print("\n")`
initial commit 2023-10-06 21:35:53 +02:00			`except KeyboardInterrupt:`
			`break`