diff --git a/RAG/examples/basic_rag/llamaindex/docker-compose.yaml b/RAG/examples/basic_rag/llamaindex/docker-compose.yaml
index 520f70370..b7bfccc4f 100644
--- a/RAG/examples/basic_rag/llamaindex/docker-compose.yaml
+++ b/RAG/examples/basic_rag/llamaindex/docker-compose.yaml
@@ -25,19 +25,18 @@ services:
       APP_VECTORSTORE_URL: "http://milvus:19530"
       # Type of vectordb used to store embedding supported type milvus, pgvector
       APP_VECTORSTORE_NAME: "milvus"
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-e5-v5}
+      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-openai-main/text-embedding-ada-002}
       # embedding model engine used for inference, supported type nvidia-ai-endpoints, huggingface
-      APP_EMBEDDINGS_MODELENGINE: ${APP_EMBEDDINGS_MODELENGINE:-nvidia-ai-endpoints}
+      APP_EMBEDDINGS_MODELENGINE: ${APP_EMBEDDINGS_MODELENGINE:-openai}
       # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-""}
+      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-"https://internal.devtest.truefoundry.tech/api/llm/api/inference/openai"}
       # url on which llm model is hosted. If "", Nvidia hosted API is used
-      APP_LLM_SERVERURL: ${APP_LLM_SERVERURL:-""}
-      APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"meta/llama3-8b-instruct"}
+      APP_LLM_SERVERURL: ${APP_LLM_SERVERURL:-"https://internal.devtest.truefoundry.tech/api/llm/api/inference/openai"}
+      APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"openai-main/gpt-4o"}
       # embedding model engine used for inference, supported type nvidia-ai-endpoints
-      APP_LLM_MODELENGINE: ${APP_LLM_MODELENGINE:-nvidia-ai-endpoints}
-      NVIDIA_API_KEY: ${NVIDIA_API_KEY}
+      APP_LLM_MODELENGINE: ${APP_LLM_MODELENGINE:-openai}
       # vectorstore collection name to store embeddings
-      COLLECTION_NAME: ${COLLECTION_NAME:-developer_rag}
+      COLLECTION_NAME: ${COLLECTION_NAME:-openai-rag}
       APP_RETRIEVER_TOPK: 4
       APP_RETRIEVER_SCORETHRESHOLD: 0.25
       # observability server url
@@ -81,7 +80,7 @@ services:
       APP_SERVERURL: http://chain-server
       APP_SERVERPORT: 8081
       # model name displayed on UI
-      APP_MODELNAME: ${APP_LLM_MODELNAME:-"meta/llama3-8b-instruct"}
+      APP_MODELNAME: ${APP_LLM_MODELNAME:-"openai-main/gpt-4o"}
       # observability server url
       OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
       OTEL_EXPORTER_OTLP_PROTOCOL: grpc
@@ -96,4 +95,4 @@ services:

 networks:
   default:
-    name: nvidia-rag
+    name: openai-rag
diff --git a/RAG/examples/local_deploy/docker-compose-nim-ms.yaml b/RAG/examples/local_deploy/docker-compose-nim-ms.yaml
index 6c73fff02..ecbf1028d 100644
--- a/RAG/examples/local_deploy/docker-compose-nim-ms.yaml
+++ b/RAG/examples/local_deploy/docker-compose-nim-ms.yaml
@@ -10,7 +10,7 @@ services:
     expose:
       - "8000"
     environment:
-      NGC_API_KEY: ${NGC_API_KEY}
+      NGC_API_KEY: ${NGC_API_KEY:-1234567890}
     shm_size: 20gb
     deploy:
       resources:
@@ -37,7 +37,7 @@ services:
     expose:
       - "8000"
     environment:
-      NGC_API_KEY: ${NGC_API_KEY}
+      NGC_API_KEY: ${NGC_API_KEY:-1234567890}
     user: "${USERID}"
     shm_size: 16GB
     deploy:
@@ -65,22 +65,22 @@ services:
     expose:
       - "8000"
     environment:
-      NGC_API_KEY: ${NGC_API_KEY}
+      NGC_API_KEY: ${NGC_API_KEY:-1234567890}
     user: "${USERID}"
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
       interval: 10s
       timeout: 20s
       retries: 100
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              device_ids: ['${RANKING_MS_GPU_ID:-0}']
-              capabilities: [gpu]
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           device_ids: ['${RANKING_MS_GPU_ID:-0}']
+    #           capabilities: [gpu]
     profiles: ["nemo-retriever"]

 networks:
   default:
-    name: nvidia-rag
+    name: openai-rag
diff --git a/RAG/examples/local_deploy/docker-compose-vectordb.yaml b/RAG/examples/local_deploy/docker-compose-vectordb.yaml
index 88ae5ee82..a003d4109 100644
--- a/RAG/examples/local_deploy/docker-compose-vectordb.yaml
+++ b/RAG/examples/local_deploy/docker-compose-vectordb.yaml
@@ -74,13 +74,13 @@ services:
     depends_on:
      - "etcd"
      - "minio"
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: ["gpu"]
-              device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}']
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           capabilities: ["gpu"]
+    #           device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}']
     profiles: ["nemo-retriever", "milvus", ""]

   elasticsearch:
@@ -119,4 +119,4 @@ services:

 networks:
   default:
-    name: nvidia-rag
+    name: openai-rag
diff --git a/RAG/src/chain_server/configuration.py b/RAG/src/chain_server/configuration.py
index ba0dde217..07a13b1d0 100644
--- a/RAG/src/chain_server/configuration.py
+++ b/RAG/src/chain_server/configuration.py
@@ -14,8 +14,12 @@
 # limitations under the License.

 """The definition of the application configuration."""
-from RAG.src.chain_server.configuration_wizard import ConfigWizard, configclass, configfield
-
+from RAG.src.chain_server.configuration_wizard import (
+    ConfigWizard,
+    configclass,
+    configfield,
+)
+import os

 @configclass
 class VectorStoreConfig(ConfigWizard):
@@ -61,13 +65,18 @@ class LLMConfig(ConfigWizard):
     model_engine: str = configfield(
         "model_engine",
         default="nvidia-ai-endpoints",
-        help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints",
+        help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints,openai",
     )
     model_name_pandas_ai: str = configfield(
         "model_name_pandas_ai",
         default="ai-mixtral-8x7b-instruct",
         help_txt="The name of the ai catalog model to be used with PandasAI agent",
     )
+    api_key: str = configfield(
+        "api_key",
+        default=os.getenv("TFY_API_KEY"),
+        help_txt="API KEY",
+    )


 @configclass
@@ -104,16 +113,21 @@ class EmbeddingConfig(ConfigWizard):
     model_engine: str = configfield(
         "model_engine",
         default="nvidia-ai-endpoints",
-        help_txt="The server type of the hosted model. Allowed values are hugginface",
+        help_txt="The server type of the hosted model. Allowed values are hugginface,openai",
     )
     dimensions: int = configfield(
         "dimensions",
-        default=1024,
+        default=1536,
         help_txt="The required dimensions of the embedding model. Currently utilized for vector DB indexing.",
     )
     server_url: str = configfield(
         "server_url",
         default="",
         help_txt="The url of the server hosting nemo embedding model",
     )
+    api_key: str = configfield(
+        "api_key",
+        default=os.getenv("TFY_API_KEY"),
+        help_txt="API KEY",
+    )


 @configclass
@@ -129,11 +143,16 @@ class RankingConfig(ConfigWizard):
     model_engine: str = configfield(
         "model_engine",
         default="nvidia-ai-endpoints",
-        help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints",
+        help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints,openai",
     )
     server_url: str = configfield(
         "server_url",
         default="",
         help_txt="The url of the server hosting nemo Ranking model",
     )
+    api_key: str = configfield(
+        "api_key",
+        default=os.getenv("TFY_API_KEY"),
+        help_txt="API KEY",
+    )


 @configclass
diff --git a/RAG/src/chain_server/requirements.txt b/RAG/src/chain_server/requirements.txt
index 217b864ae..ef3096a96 100644
--- a/RAG/src/chain_server/requirements.txt
+++ b/RAG/src/chain_server/requirements.txt
@@ -10,6 +10,8 @@ llama-index-llms-langchain==0.1.3
 llama-index-embeddings-langchain==0.1.2
 llama-index-vector-stores-milvus==0.1.6
 llama-index-vector-stores-postgres==0.1.5
+llama-index-llms-openai-like
+langchain-openai>=0.0.2
 pymilvus==2.4.0
 dataclass-wizard==0.22.3
 opencv-python==4.8.0.74
diff --git a/RAG/src/chain_server/utils.py b/RAG/src/chain_server/utils.py
index 0a7ee247d..3f1bebcdb 100644
--- a/RAG/src/chain_server/utils.py
+++ b/RAG/src/chain_server/utils.py
@@ -22,6 +22,11 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
 from urllib.parse import urlparse

+import httpx
+
+SSL_VERIFY = os.getenv("SSL_VERIFY", "true")
+
+
 import yaml

 logger = logging.getLogger(__name__)
@@ -45,7 +50,10 @@
     from llama_index.core.indices import VectorStoreIndex
     from llama_index.core.postprocessor.types import BaseNodePostprocessor
     from llama_index.core.schema import MetadataMode
-    from llama_index.core.service_context import ServiceContext, set_global_service_context
+    from llama_index.core.service_context import (
+        ServiceContext,
+        set_global_service_context,
+    )
     from llama_index.core.utils import get_tokenizer, globals_helper
     from llama_index.embeddings.langchain import LangchainEmbedding
     from llama_index.llms.langchain import LangChainLLM
@@ -94,6 +102,10 @@
 from langchain_core.documents.compressor import BaseDocumentCompressor
 from langchain_core.embeddings import Embeddings
 from langchain_core.language_models.chat_models import SimpleChatModel
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from llama_index.core import Settings
+from llama_index.llms.openai_like import OpenAILike
+from openai import OpenAI

 from RAG.src.chain_server import configuration

@@ -101,8 +113,6 @@
 from RAG.src.chain_server.configuration_wizard import ConfigWizard

 DEFAULT_MAX_CONTEXT = 1500
-
-
 class LimitRetrievedNodesLength(BaseNodePostprocessor):
     """Llama Index chain filter to limit token lengths."""

@@ -397,6 +407,26 @@ def get_llm(**kwargs) -> LLM | SimpleChatModel:
                 top_p=kwargs.get('top_p', None),
                 max_tokens=kwargs.get('max_tokens', None),
             )
+    elif settings.llm.model_engine == "openai":
+        logger.info(f"Using llm model {settings.llm.model_name} from api catalog")
+
+        llm = ChatOpenAI(
+            model=settings.llm.model_name,
+            default_headers={
+                "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+            },
+            base_url=settings.llm.server_url,
+            api_key=settings.llm.api_key,
+            async_client=OpenAI(
+                api_key=settings.llm.api_key,
+                base_url=settings.llm.server_url,
+                http_client=httpx.Client(verify=SSL_VERIFY == "true"),
+            ),
+            http_client=httpx.Client(verify=SSL_VERIFY == "true"),
+            # is_chat_model=True
+        )
+        return llm
+
     else:
         raise RuntimeError(
             "Unable to find any supported Large Language Model server. Supported engine name is nvidia-ai-endpoints."
@@ -438,6 +468,22 @@ def get_embedding_model() -> Embeddings:
         else:
             logger.info(f"Using embedding model {settings.embeddings.model_name} hosted at api catalog")
             return NVIDIAEmbeddings(model=settings.embeddings.model_name, truncate="END")
+    elif settings.llm.model_engine == "openai":
+        embeddings = OpenAIEmbeddings(
+            base_url=settings.embeddings.server_url,
+            api_key=settings.embeddings.api_key,
+            model=settings.embeddings.model_name,
+            default_headers={
+                "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+            },
+            async_client=OpenAI(
+                api_key=settings.embeddings.api_key,
+                base_url=settings.embeddings.server_url,
+                http_client=httpx.Client(verify=SSL_VERIFY == "true"),
+            ),
+            http_client=httpx.Client(verify=SSL_VERIFY == "true"),
+        )
+        return embeddings
     else:
         raise RuntimeError(
             "Unable to find any supported embedding model. Supported engine is huggingface and nvidia-ai-endpoints."
@@ -459,11 +505,25 @@ def get_ranking_model() -> BaseDocumentCompressor:
             if settings.ranking.server_url:
                 logger.info(f"Using ranking model hosted at {settings.ranking.server_url}")
                 return NVIDIARerank(
-                    base_url=f"http://{settings.ranking.server_url}/v1", top_n=settings.retriever.top_k, truncate="END"
+                    base_url=f"{settings.ranking.server_url}/v1",
+                    top_n=settings.retriever.top_k,
+                    truncate="END",
                 )
             elif settings.ranking.model_name:
                 logger.info(f"Using ranking model {settings.ranking.model_name} hosted at api catalog")
                 return NVIDIARerank(model=settings.ranking.model_name, top_n=settings.retriever.top_k, truncate="END")
+            elif settings.ranking.model_engine == "openai":
+                logger.info(f"Using ranking model {settings.ranking.model_name} from api catalog")
+                llm = OpenAILike(
+                    model=settings.ranking.model_name,
+                    api_base=settings.ranking.server_url,
+                    api_key=settings.ranking.api_key,
+                    default_headers={
+                        "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+                    },
+                    http_client=httpx.AsyncClient(verify=SSL_VERIFY == "true"),
+                )
+                return llm
         else:
             logger.warning("Unable to find any supported ranking model. Supported engine is nvidia-ai-endpoints.")
     except Exception as e:
diff --git a/RAG/src/rag_playground/requirements.txt b/RAG/src/rag_playground/requirements.txt
index e8d22682e..514435c69 100644
--- a/RAG/src/rag_playground/requirements.txt
+++ b/RAG/src/rag_playground/requirements.txt
@@ -1,6 +1,6 @@
 PyYAML==6.0.1
 dataclass-wizard==0.22.3
-gradio==4.43.0
+gradio==4.44.1
 jinja2==3.1.3
 numpy==1.26.4
 opentelemetry-api==1.23.0
@@ -8,5 +8,6 @@ opentelemetry-exporter-otlp-proto-grpc==1.23.0
 opentelemetry-sdk==1.23.0
 protobuf==4.25.3
 pycountry==23.12.11
+pydantic==2.10.6
 tritonclient[all]==2.43.0
 uvicorn==0.27.1
diff --git a/RAG/src/rag_playground/speech/pages/converse.py b/RAG/src/rag_playground/speech/pages/converse.py
index a355e0a92..7e7d45475 100644
--- a/RAG/src/rag_playground/speech/pages/converse.py
+++ b/RAG/src/rag_playground/speech/pages/converse.py
@@ -148,6 +148,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks:
                 show_label=False,
                 container=False,
                 elem_id="microphone",
+                api_name=False,
             )

         # user feedback
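
Usage sketch (not part of the patch): the "openai" engine wiring that get_llm() and get_embedding_model() gain above can be exercised on its own roughly as follows. This assumes the langchain-openai package from the updated chain-server requirements, an OpenAI-compatible gateway reachable at APP_LLM_SERVERURL, and TFY_API_KEY / SSL_VERIFY set in the environment; the model-name defaults mirror the docker-compose defaults and are illustrative only.

# Standalone sketch of the new "openai" engine wiring; assumes langchain-openai
# is installed and the gateway URL plus TFY_API_KEY are present in the environment.
import os

import httpx
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

ssl_verify = os.getenv("SSL_VERIFY", "true") == "true"
base_url = os.getenv("APP_LLM_SERVERURL")  # OpenAI-compatible gateway endpoint
api_key = os.getenv("TFY_API_KEY")
headers = {"X-TFY-METADATA": '{"tfy_log_request":"true"}'}

# Sync httpx transport with TLS verification controlled by SSL_VERIFY
http_client = httpx.Client(verify=ssl_verify)

llm = ChatOpenAI(
    model=os.getenv("APP_LLM_MODELNAME", "openai-main/gpt-4o"),
    base_url=base_url,
    api_key=api_key,
    default_headers=headers,
    http_client=http_client,
)

embeddings = OpenAIEmbeddings(
    model=os.getenv("APP_EMBEDDINGS_MODELNAME", "openai-main/text-embedding-ada-002"),
    base_url=base_url,
    api_key=api_key,
    default_headers=headers,
    http_client=http_client,
)

print(llm.invoke("Reply with the single word: ready").content)
print(len(embeddings.embed_query("hello")))  # 1536 dims, matching the new config default

Unlike the in-process code in utils.py, this sketch does not pass an explicit async_client built from the openai package; it relies on langchain-openai's default async transport instead.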