Better prompt

Include episode title and number into data
2023-10-06 22:43:39 +02:00 · 2023-10-06 22:43:28 +02:00
2 changed files with 15 additions and 3 deletions
--- a/download_transcripts.py
+++ b/download_transcripts.py
@@ -6,8 +6,14 @@ for i in range(1, 139):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    pre_section = soup.find('pre')
+    title_section = soup.find('h1')

    if pre_section:
        text = pre_section.get_text()
+        title = title_section.get_text() if title_section else f"Episode {i}"  # soup.find('h1') is None when the page has no <h1>; fall back instead of raising AttributeError
        with open(f"data/episode_{i}.txt", "w") as f:
-            f.write(text)
+            f.write(
+                f"Darknet Diaries - {title}\n" +
+                text
+            )
+        print(title)
--- a/main.py
+++ b/main.py
@@ -11,6 +11,7 @@ service_context = ServiceContext.from_defaults(llm=llm)
 set_global_service_context(service_context)

 if not os.path.exists("./index/lock"):
+    print("Generating index...")
    documents = []
    for filename in os.listdir("./data"):
        episode_number = re.search(r'\d+', filename).group()
@@ -31,12 +32,17 @@ if not os.path.exists("./index/lock"):
    index.storage_context.persist(persist_dir="./index")
    open("./index/lock", 'a').close()
 else:
+    print("Loading index...")
    storage_context = StorageContext.from_defaults(persist_dir="./index")
    index = load_index_from_storage(storage_context)

 template = (
-    "You are now an expert on the Darknet Diaries podcast. \n"
-    "Please answer this question by referring to the podcast: {query_str}\n"
+    "You have been trained on the Darknet Diaries podcast transcripts with data from october 6 2023. "  # trailing space: adjacent literals are joined with no separator
+    "You are now an expert about it and will answer as such. You know about every episode up to number 138. \n"
+    "----------------\n"
+    "Here is the context: {context_str}\n"  # newline keeps the separator below on its own line
+    "----------------\n"
+    "Please answer this question by referring to the podcast: {query_str}"
 )
 qa_template = PromptTemplate(template)
 query_engine = index.as_query_engine(text_qa_template=qa_template)
Author	SHA1	Message	Date
Romain Quinet	a77d41c6ec	Better prompt	2023-10-06 22:43:39 +02:00
Romain Quinet	96c692aef7	Include episode title and number into data	2023-10-06 22:43:28 +02:00