# app3.py

import os
import tempfile
import requests
import streamlit as st
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from io import BytesIO

# Browser-tab title/icon, followed by the heading shown on the page itself.
st.set_page_config(
    page_title="LangChain: Chat with Eric the E-Teacher",
    page_icon="🤓",
)
st.title("🤓 Chat with Eric the Econs E-Teacher about Economics!")

# Export the key stored in Streamlit secrets through the process environment.
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]

def _download_faiss_index(base_url: str, target_dir: str, index_name: str = "index") -> None:
    """Download a serialized FAISS index from *base_url* into *target_dir*.

    ``FAISS.load_local`` reads ``<index_name>.faiss`` and ``<index_name>.pkl``
    from a local folder, so both files are fetched and written under those
    names.

    Args:
        base_url: URL prefix (ending in ``/``) under which the two files live.
        target_dir: Existing local directory that receives the files.
        index_name: Base name of the index files.
            NOTE(review): assumes the repository stores the index under the
            default name "index" -- confirm against the faiss/ folder.

    Raises:
        requests.HTTPError: If either file cannot be fetched.
    """
    for suffix in (".faiss", ".pkl"):
        filename = f"{index_name}{suffix}"
        # Stream to disk so a large index is never held in memory twice.
        with requests.get(base_url + filename, stream=True, timeout=60) as response:
            # Fail loudly rather than writing an HTML error page to disk.
            response.raise_for_status()
            with open(os.path.join(target_dir, filename), "wb") as fh:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    fh.write(chunk)


@st.cache_resource(ttl="1h")
def configure_retriever():
    """Build the retriever over the FAISS index published in the GitHub repo.

    The index files are downloaded to a temporary directory first, because
    ``FAISS.load_local`` expects a local folder path and cannot read from a
    URL. The result is cached by ``st.cache_resource`` with a one-hour TTL.

    The earlier in-process PDF ingestion path (download PDFs, split, embed)
    that used to live here as commented-out code has been removed; this
    function only loads the index stored under ``faiss/`` in the repo.

    Returns:
        An MMR retriever (``k=2``, ``fetch_k=4``) over the loaded vector store.

    Raises:
        requests.HTTPError: If the index files cannot be downloaded.
    """
    owner = 'jemyap91'
    repo = 'LLM-for-good'
    index_url = f"https://raw.githubusercontent.com/{owner}/{repo}/main/faiss/"

    # Must be the same embedding model the index was built with.
    # NOTE(review): presumably text-embedding-ada-002 -- confirm with whoever
    # generated the faiss/ folder.
    embedding_function = OpenAIEmbeddings(model='text-embedding-ada-002',
                                          show_progress_bar=True)

    # The index is read fully into memory by load_local, so the downloaded
    # files can be discarded as soon as loading finishes.
    with tempfile.TemporaryDirectory() as index_dir:
        _download_faiss_index(index_url, index_dir)
        # SECURITY: load_local unpickles index.pkl, which here comes from a
        # remote URL. Only safe while that repository is fully trusted.
        # NOTE(review): newer LangChain releases additionally require
        # allow_dangerous_deserialization=True for this call.
        vectordb = FAISS.load_local(index_dir, embedding_function)

    # Define retriever
    return vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4})


class StreamHandler(BaseCallbackHandler):
    """Callback handler that renders streamed LLM tokens into a container.

    Tokens are appended to ``self.text`` and the whole accumulated text is
    re-rendered as markdown on every new token. Tokens belonging to one
    specific run (recorded in ``on_llm_start``) are skipped.
    """

    def __init__(self, container: st.delta_generator.DeltaGenerator, initial_text: str = ""):
        """Store the target *container* and seed the buffer with *initial_text*."""
        self.run_id_ignore_token = None  # run id whose tokens are not displayed
        self.text = initial_text
        self.container = container

    def on_llm_start(self, serialized: dict, prompts: list, **kwargs):
        """Remember the run id of an LLM call whose first prompt starts with "Human".

        Workaround so the rephrased question is not shown as output.
        """
        first_prompt = prompts[0]
        if not first_prompt.startswith("Human"):
            return
        self.run_id_ignore_token = kwargs.get("run_id")

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append *token* and refresh the container, unless its run is ignored."""
        # False as the fallback never equals the initial None marker, so
        # tokens without a run id are still displayed.
        current_run = kwargs.get("run_id", False)
        if self.run_id_ignore_token != current_run:
            self.text += token
            self.container.markdown(self.text)


class PrintRetrievalHandler(BaseCallbackHandler):
    """Callback handler that reports retrieval progress in a status widget."""

    def __init__(self, container):
        """Create the "Context Retrieval" status element inside *container*."""
        self.status = container.status("**Context Retrieval**")

    def on_retriever_start(self, serialized: dict, query: str, **kwargs):
        """Show the query being retrieved for, in both the body and the label."""
        status = self.status
        status.write(f"**Question:** {query}")
        status.update(label=f"**Context Retrieval:** {query}")

    def on_retriever_end(self, documents, **kwargs):
        """List every retrieved document, then mark the status as complete."""
        status = self.status
        for idx, doc in enumerate(documents):
            # Only the file name of the source path is displayed.
            source_path = doc.metadata["source"]
            source = os.path.basename(source_path)
            heading = f"**Document {idx} from {source}**"
            status.write(heading)
            status.markdown(doc.page_content)
        status.update(state="complete")


# Ask for the key in the sidebar (masked input); without one, show a hint
# and halt the script run here.
openai_api_key = st.sidebar.text_input(
    "OpenAI API Key",
    type="password",
)
if not openai_api_key:
    st.info("Please add your OpenAI API key to continue.")
    st.stop()

# uploaded_files = st.sidebar.file_uploader(
#     label="Upload PDF files", type=["pdf"], accept_multiple_files=True
# )
# if not uploaded_files:
#     st.info("Please upload PDF documents to continue.")
#     st.stop()

# Retriever over the lecture-note index (cached by configure_retriever).
retriever = configure_retriever()

# Conversation memory: the chat history object is shared with the rendering
# code and exposed to the chain under the "chat_history" key.
msgs = StreamlitChatMessageHistory()
memory = ConversationBufferMemory(
    memory_key="chat_history",
    chat_memory=msgs,
    return_messages=True,
)

# Streaming chat model, authenticated with the key entered in the sidebar.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key=openai_api_key,
    temperature=0,
    streaming=True,
)

# Conversational retrieval chain tying model, retriever and memory together.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
    verbose=True,
)

# Start (or restart) the conversation with a greeting. The sidebar button is
# only evaluated when the history is non-empty, because `or` short-circuits.
if not msgs.messages or st.sidebar.button("Clear message history"):
    msgs.clear()
    msgs.add_ai_message("What do you want to know about economics young padawan?")

# Replay the stored conversation, mapping message types to chat avatars.
avatars = {"human": "user", "ai": "assistant"}
for msg in msgs.messages:
    bubble = st.chat_message(avatars[msg.type])
    bubble.write(msg.content)

# Handle a newly submitted question, if any.
user_query = st.chat_input(placeholder="Ask me stuff!")
if user_query:
    st.chat_message("user").write(user_query)

    with st.chat_message("assistant"):
        # Create the retrieval status container before the answer placeholder,
        # keeping the original element order.
        retrieval_handler = PrintRetrievalHandler(st.container())
        stream_handler = StreamHandler(st.empty())
        response = qa_chain.run(
            user_query,
            callbacks=[retrieval_handler, stream_handler],
        )