Enterprise Grade RAG in #Microsoft #Fabric using #CosmosDB and #DiskANN
In this video, we’re hardening our RAG architecture to meet the demands of the enterprise! Building on our previous video (https://youtu.be/jwVOQCUUH1Y), we are making our RAG for Microsoft Fabric enterprise grade.
This is the code used in this video:
%pip install langchain %pip install langchain-core %pip install langchain-experimental %pip install langchain_openai %pip install langchain-chroma %pip install langchainhub %pip install PyPDF2 %pip install --upgrade --quiet azure-cosmos langchain-openai langchain-community import os, openai#, langchain, uuid from synapse.ml.core.platform import find_secret openai_key = find_secret(secret_name="YOUROPENAIKEY", keyvault="YOUR_KEYVAULT_NAME") cosmosdb_key = find_secret(secret_name="YOURCOSMOSKEY", keyvault="YOUR_KEYVAULT_NAME") openai_service_name = "YOUR_SERVICE_NAME" openai_endpoint = "https://YOUR_SERVICE_NAME.openai.azure.com/" openai_deployment_for_embeddings = "text-embedding-ada-002" openai_deployment_for_query = "gpt-35-turbo" openai_deployment_for_completions = "davinci-002" #"davinci-002" openai_api_type = "azure" openai_api_version = "2023-12-01-preview" os.environ["OPENAI_API_TYPE"] = openai_api_type os.environ["OPENAI_API_VERSION"] = openai_api_version #os.environ["OPENAI_API_BASE"] = """" os.environ["OPENAI_API_KEY"] = openai_key os.environ["AZURE_OPENAI_ENDPOINT"] = openai_endpoint base_path = "/lakehouse/default/Files/YOURFOLDER/" del os.environ['OPENAI_API_BASE'] import bs4 from langchain import hub from langchain_chroma import Chroma from langchain_community.document_loaders import WebBaseLoader from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnablePassthrough from langchain_openai import OpenAIEmbeddings #from langchain_openai import AzureOpenAIEmbeddings from langchain.embeddings import AzureOpenAIEmbeddings from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain.llms import AzureOpenAI, OpenAI from langchain_openai import AzureOpenAIEmbeddings from PyPDF2 import PdfReader from langchain.document_loaders import PyPDFLoader from langchain.schema import Document folder_path = base_path def load_pdfs_from_folder(folder_path): documents = [] for filename in 
os.listdir(folder_path): if filename.endswith('.pdf'): file_path = os.path.join(folder_path, filename) reader = PdfReader(file_path) text = "" for page in reader.pages: text += page.extract_text() document = Document(page_content=text, metadata={"document_name": filename}) documents.append(document) return documents # Load documents documents = load_pdfs_from_folder(folder_path) # Print the content of each document for doc in documents: print(f"Document Name: {doc.metadata['document_name']}") #print(doc.page_content) print("\n---\n") indexing_policy = { "indexingMode": "consistent", "includedPaths": [{"path": "/*"}], "excludedPaths": [{"path": '/"_etag"/?'}], "vectorIndexes": [{"path": "/embedding", "type": "diskANN"}], } vector_embedding_policy = { "vectorEmbeddings": [ { "path": "/embedding", "dataType": "float32", "distanceFunction": "cosine", "dimensions": 1536, } ] } from azure.cosmos import CosmosClient, PartitionKey from langchain_community.vectorstores.azure_cosmos_db_no_sql import ( AzureCosmosDBNoSqlVectorSearch, ) from langchain_openai import AzureOpenAIEmbeddings HOST = "https://YOURCOSMOSDB.documents.azure.com:443/" KEY = cosmosdb_key cosmos_client = CosmosClient(HOST, KEY) database_name = "YOURCOSMOSDBNAME" container_name = "YOURCONTAINER" partition_key = PartitionKey(path="/id") cosmos_container_properties = {"partition_key": partition_key} cosmos_database_properties = {"id": database_name} openai_embeddings = AzureOpenAIEmbeddings() text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200) splits = text_splitter.split_documents(documents) # insert the documents in AzureCosmosDBNoSql with their embedding vector_search = AzureCosmosDBNoSqlVectorSearch.from_documents( documents=splits, embedding=openai_embeddings, cosmos_client=cosmos_client, database_name=database_name, container_name=container_name, vector_embedding_policy=vector_embedding_policy, indexing_policy=indexing_policy, 
cosmos_container_properties=cosmos_container_properties, cosmos_database_properties=cosmos_database_properties, ) from langchain.schema import HumanMessage import openai display(answers[0].page_content) from langchain_openai import AzureChatOpenAI from langchain.schema import HumanMessage import openai llm = AzureChatOpenAI(azure_deployment=openai_deployment_for_query) retriever = vector_search.as_retriever() prompt = hub.pull("rlm/rag-prompt") message = HumanMessage( content="Tell me what you know about Prohabits." ) result = llm.invoke([message]) def format_docs(docs): return "\n\n".join(doc.page_content for doc in docs) rag_chain = ( {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser() ) rag_chain.invoke("What is Prohabits?")