deps-managment-and-dotenv #1

Open
EndMove wants to merge 3 commits from EndMove/darknet_diaries_llm:deps-managment-and-dotenv into master
4 changed files with 124 additions and 104 deletions
Showing only changes of commit 6d7600e47e - Show all commits

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
.env
/transcripts
/index
/.idea

View File

@ -5,26 +5,30 @@ import json
folder_path = "transcripts"
if not os.path.exists(folder_path):
os.makedirs(folder_path)
if __name__ == '__main__':
Review

c'est vraiment nécessaire d'avoir ça dans un fichier qui n'est pas importé par d'autres fichiers? C'est juste un script pas un module

c'est vraiment nécessaire d'avoir ça dans un fichier qui n'est pas importé par d'autres fichiers? C'est juste un script pas un module
Review

yup, pour spécifier que c'est un script "exécutable"

yup, pour spécifier que c'est un script "exécutable"
if not os.path.exists(folder_path):
os.makedirs(folder_path)
for i in range(1, 139):
try:
url = f"https://darknetdiaries.com/transcript/{i}"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
for i in range(1, 139):
try:
# fetch transcript
url = f"https://darknetdiaries.com/transcript/{i}"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
transcript = soup.find('pre').get_text()
transcript = soup.find('pre').get_text()
url = f"https://api.darknetdiaries.com/{i}.json"
r = requests.get(url)
parsed_json = json.loads(r.text)
title = parsed_json["episode_name"]
number = parsed_json["episode_number"]
downloads = parsed_json["total_downloads"]
# fetch transcript metadata
url = f"https://api.darknetdiaries.com/{i}.json"
r = requests.get(url)
parsed_json = json.loads(r.text)
title = parsed_json["episode_name"]
number = parsed_json["episode_number"]
downloads = parsed_json["total_downloads"]
with open(f"{folder_path}/episode_{number}.txt", "w") as f:
f.write(f"{title}\n{downloads}\n{transcript}")
print(f"{number} {title}")
except Exception:
print(f"Failed scraping episode {i}")
# write transcript
with open(f"{folder_path}/episode_{number}.txt", "w", encoding='utf-8') as f:
f.write(f"{title}\n{downloads}\n{transcript}")
print(f"{number} {title}")
except Exception as err:
print(f"Failed scraping episode {i} : [{err}]")

174
main.py
View File

@ -6,100 +6,108 @@ from llama_index.llms import OpenAI, ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate
# from llama_index import set_global_handler
from llama_index.chat_engine.types import ChatMode
from dotenv import load_dotenv
import os
import re
# set_global_handler("simple")
llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256)
# load .env
load_dotenv()
OPEN_API_KEY = os.getenv('OPEN_API_KEY')
Review

pas nécessaire, llama-index prend déjà la clé API depuis les envvar

pas nécessaire, llama-index prend déjà la clé API depuis les envvar
Review

pour ceux qui ne passent pas par là :p genre moi ^^

pour ceux qui ne passent pas par là :p genre moi ^^
# config llm context
# The API key is read from the environment (.env is loaded above) and must never
# be committed as a literal. NOTE: the key previously hard-coded on this line was
# exposed in version control and has to be revoked.
llm = OpenAI(model="gpt-4", temperature=0, max_tokens=256, api_key=OPEN_API_KEY)
Outdated
Review

merci pour la clé gratuite xD

merci pour la clé gratuite xD

oupss ;')

oupss ;')
service_context = ServiceContext.from_defaults(llm=llm)
set_global_service_context(service_context)
if not os.path.exists("./index/lock"):
documents = []
for filename in os.listdir("./transcripts"):
episode_number = re.search(r'\d+', filename).group()
with open("./transcripts/" + filename, 'r') as f:
title = f.readline().strip()
downloads = f.readline().strip()
content = f.read()
document = Document(
text=content,
doc_id=filename,
metadata={
"episode_number": episode_number,
"episode_title": title,
"episode_downloads": downloads,
"episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
}
)
documents.append(document)
# TODO: split into small functions
Review

no

no
if __name__ == '__main__':
if not os.path.exists("./index/lock"):
documents = []
for filename in os.listdir("./transcripts"):
episode_number = re.search(r'\d+', filename).group()
with open("./transcripts/" + filename, 'r') as f:
title = f.readline().strip()
downloads = f.readline().strip()
content = f.read()
document = Document(
text=content,
doc_id=filename,
metadata={
"episode_number": episode_number,
"episode_title": title,
"episode_downloads": downloads,
"episode_url": f"https://darknetdiaries.com/episode/{episode_number}/"
}
)
documents.append(document)
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes, show_progress=True)
index.storage_context.persist(persist_dir="./index")
open("./index/lock", 'a').close()
else:
print("Loading index...")
storage_context = StorageContext.from_defaults(persist_dir="./index")
index = load_index_from_storage(storage_context)
index = VectorStoreIndex(nodes, show_progress=True)
index.storage_context.persist(persist_dir="./index")
open("./index/lock", 'a').close()
else:
print("Loading index...")
storage_context = StorageContext.from_defaults(persist_dir="./index")
index = load_index_from_storage(storage_context)
chat_text_qa_msgs = [
ChatMessage(
role=MessageRole.SYSTEM,
content=(
"You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023."
"You are an expert about it and will answer as such. You know about every episode up to number 138."
"Always answer the question, even if the context isn't helpful."
"Mention the number and title of the episodes you are referring to."
)
),
ChatMessage(
role=MessageRole.USER,
content=(
"Context information is below.\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"Given the context information and not prior knowledge,"
"answer the question: {query_str}\n"
)
)
]
text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)
chat_refine_msgs = [
ChatMessage(
role=MessageRole.SYSTEM,
content="Always answer the question, even if the context isn't helpful.",
),
ChatMessage(
role=MessageRole.USER,
content=(
"We have the opportunity to refine the original answer "
"(only if needed) with some more context below.\n"
"------------\n"
"{context_msg}\n"
"------------\n"
"Given the new context, refine the original answer to better "
"answer the question: {query_str}. "
"If the context isn't useful, output the original answer again.\n"
"Original Answer: {existing_answer}"
chat_text_qa_msgs = [
ChatMessage(
role=MessageRole.SYSTEM,
content=(
"You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023."
"You are an expert about it and will answer as such. You know about every episode up to number 138."
"Always answer the question, even if the context isn't helpful."
"Mention the number and title of the episodes you are referring to."
)
),
),
]
refine_template = ChatPromptTemplate(chat_refine_msgs)
ChatMessage(
role=MessageRole.USER,
content=(
"Context information is below.\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"Given the context information and not prior knowledge,"
"answer the question: {query_str}\n"
)
)
]
text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)
chat_engine = index.as_chat_engine(
text_qa_template=text_qa_template,
refine_template=refine_template,
chat_mode=ChatMode.OPENAI
)
chat_refine_msgs = [
ChatMessage(
role=MessageRole.SYSTEM,
content="Always answer the question, even if the context isn't helpful.",
),
ChatMessage(
role=MessageRole.USER,
content=(
"We have the opportunity to refine the original answer "
"(only if needed) with some more context below.\n"
"------------\n"
"{context_msg}\n"
"------------\n"
"Given the new context, refine the original answer to better "
"answer the question: {query_str}. "
"If the context isn't useful, output the original answer again.\n"
"Original Answer: {existing_answer}"
),
),
]
refine_template = ChatPromptTemplate(chat_refine_msgs)
while True:
try:
chat_engine.chat_repl()
except KeyboardInterrupt:
break
chat_engine = index.as_chat_engine(
text_qa_template=text_qa_template,
refine_template=refine_template,
chat_mode=ChatMode.OPENAI
)
while True:
try:
chat_engine.chat_repl()
except KeyboardInterrupt:
break

View File

@ -1,8 +1,15 @@
# =====================
# Required dependencies
# =====================
llama-index==0.8.*
beautifulsoup4==4.12.*
# general deps
requests~=2.31.0
llama-index~=0.8.40
beautifulsoup4~=4.12.2
python-dotenv~=1.0.0
# llama sub deps
transformers~=4.34.0
torch~=2.1.0
# =====================
# Development dependencies