19 changes: 9 additions & 10 deletions RAG/examples/basic_rag/llamaindex/docker-compose.yaml
@@ -25,19 +25,18 @@ services:
APP_VECTORSTORE_URL: "http://milvus:19530"
# Type of vectordb used to store embedding supported type milvus, pgvector
APP_VECTORSTORE_NAME: "milvus"
-APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-e5-v5}
+APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-openai-main/text-embedding-ada-002}
# embedding model engine used for inference, supported type nvidia-ai-endpoints, huggingface
-APP_EMBEDDINGS_MODELENGINE: ${APP_EMBEDDINGS_MODELENGINE:-nvidia-ai-endpoints}
+APP_EMBEDDINGS_MODELENGINE: ${APP_EMBEDDINGS_MODELENGINE:-openai}
# url on which embedding model is hosted. If "", Nvidia hosted API is used
-APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-""}
+APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-"https://internal.devtest.truefoundry.tech/api/llm/api/inference/openai"}
# url on which llm model is hosted. If "", Nvidia hosted API is used
-APP_LLM_SERVERURL: ${APP_LLM_SERVERURL:-""}
-APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"meta/llama3-8b-instruct"}
+APP_LLM_SERVERURL: ${APP_LLM_SERVERURL:-"https://internal.devtest.truefoundry.tech/api/llm/api/inference/openai"}
+APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"openai-main/gpt-4o"}
# llm model engine used for inference, supported type nvidia-ai-endpoints, openai
-APP_LLM_MODELENGINE: ${APP_LLM_MODELENGINE:-nvidia-ai-endpoints}
-NVIDIA_API_KEY: ${NVIDIA_API_KEY}
+APP_LLM_MODELENGINE: ${APP_LLM_MODELENGINE:-openai}
# vectorstore collection name to store embeddings
-COLLECTION_NAME: ${COLLECTION_NAME:-developer_rag}
+COLLECTION_NAME: ${COLLECTION_NAME:-openai_rag}
APP_RETRIEVER_TOPK: 4
APP_RETRIEVER_SCORETHRESHOLD: 0.25
# observability server url
@@ -81,7 +80,7 @@ services:
APP_SERVERURL: http://chain-server
APP_SERVERPORT: 8081
# model name displayed on UI
-APP_MODELNAME: ${APP_LLM_MODELNAME:-"meta/llama3-8b-instruct"}
+APP_MODELNAME: ${APP_LLM_MODELNAME:-"openai-main/gpt-4o"}
# observability server url
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
@@ -96,4 +95,4 @@ services:

networks:
default:
-name: nvidia-rag
+name: openai-rag
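Note on the defaults above: the chain server now points at an OpenAI-compatible gateway (the TrueFoundry URL) instead of NVIDIA-hosted endpoints. A quick, hypothetical smoke test for that gateway, not part of this change; it assumes `TFY_API_KEY` holds a valid gateway key:

```python
# Hypothetical smoke test for the OpenAI-compatible gateway configured above.
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.environ.get(
        "APP_LLM_SERVERURL",
        "https://internal.devtest.truefoundry.tech/api/llm/api/inference/openai",
    ),
    api_key=os.environ["TFY_API_KEY"],  # assumption: the gateway key lives here
)
resp = client.chat.completions.create(
    model=os.environ.get("APP_LLM_MODELNAME", "openai-main/gpt-4o"),
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)
```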
22 changes: 11 additions & 11 deletions RAG/examples/local_deploy/docker-compose-nim-ms.yaml
@@ -10,7 +10,7 @@ services:
expose:
- "8000"
environment:
-NGC_API_KEY: ${NGC_API_KEY}
+NGC_API_KEY: ${NGC_API_KEY:-1234567890}
shm_size: 20gb
deploy:
resources:
@@ -37,7 +37,7 @@ services:
expose:
- "8000"
environment:
-NGC_API_KEY: ${NGC_API_KEY}
+NGC_API_KEY: ${NGC_API_KEY:-1234567890}
user: "${USERID}"
shm_size: 16GB
deploy:
@@ -65,22 +65,22 @@ services:
expose:
- "8000"
environment:
-NGC_API_KEY: ${NGC_API_KEY}
+NGC_API_KEY: ${NGC_API_KEY:-1234567890}
user: "${USERID}"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 10s
timeout: 20s
retries: 100
-deploy:
-  resources:
-    reservations:
-      devices:
-        - driver: nvidia
-          device_ids: ['${RANKING_MS_GPU_ID:-0}']
-          capabilities: [gpu]
+# deploy:
+#   resources:
+#     reservations:
+#       devices:
+#         - driver: nvidia
+#           device_ids: ['${RANKING_MS_GPU_ID:-0}']
+#           capabilities: [gpu]
profiles: ["nemo-retriever"]

networks:
default:
-name: nvidia-rag
+name: openai-rag
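The `${NGC_API_KEY:-1234567890}` placeholder leans on compose's `${VAR:-default}` substitution: the fallback applies only when the variable is unset or empty, so a real key exported in the shell still wins. A minimal Python sketch of that resolution rule:

```python
# Sketch of docker-compose's ${VAR:-default} semantics: fall back when
# the variable is unset or empty, otherwise keep the exported value.
import os

def compose_default(name: str, fallback: str) -> str:
    value = os.environ.get(name)
    return value if value else fallback

print(compose_default("NGC_API_KEY", "1234567890"))
```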
16 changes: 8 additions & 8 deletions RAG/examples/local_deploy/docker-compose-vectordb.yaml
@@ -74,13 +74,13 @@ services:
depends_on:
- "etcd"
- "minio"
-deploy:
-  resources:
-    reservations:
-      devices:
-        - driver: nvidia
-          capabilities: ["gpu"]
-          device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}']
+# deploy:
+#   resources:
+#     reservations:
+#       devices:
+#         - driver: nvidia
+#           capabilities: ["gpu"]
+#           device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}']
profiles: ["nemo-retriever", "milvus", ""]

elasticsearch:
@@ -119,4 +119,4 @@

networks:
default:
-name: nvidia-rag
+name: openai-rag
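With the GPU reservation commented out, Milvus runs in CPU mode; the client-facing connection path is unchanged. A hypothetical reachability check (not part of this change), assuming the default 19530 port mapping:

```python
# Hypothetical check that the CPU-mode Milvus from this compose file is up.
from pymilvus import connections, utility

connections.connect(alias="default", uri="http://localhost:19530")
print(utility.get_server_version())
```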
31 changes: 25 additions & 6 deletions RAG/src/chain_server/configuration.py
@@ -14,8 +14,12 @@
# limitations under the License.

"""The definition of the application configuration."""
-from RAG.src.chain_server.configuration_wizard import ConfigWizard, configclass, configfield
+import os
+
+from RAG.src.chain_server.configuration_wizard import (
+    ConfigWizard,
+    configclass,
+    configfield,
+)

@configclass
class VectorStoreConfig(ConfigWizard):
@@ -61,13 +65,18 @@ class LLMConfig(ConfigWizard):
model_engine: str = configfield(
"model_engine",
default="nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints,openai",
)
model_name_pandas_ai: str = configfield(
"model_name_pandas_ai",
default="ai-mixtral-8x7b-instruct",
help_txt="The name of the ai catalog model to be used with PandasAI agent",
)
+api_key: str = configfield(
+    "api_key",
+    default=os.getenv("TFY_API_KEY"),
+    help_txt="API key for the OpenAI-compatible endpoint (read from TFY_API_KEY at import time)",
+)


@configclass
@@ -104,16 +113,21 @@ class EmbeddingConfig(ConfigWizard):
model_engine: str = configfield(
"model_engine",
default="nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are hugginface",
help_txt="The server type of the hosted model. Allowed values are hugginface,openai",
)
dimensions: int = configfield(
"dimensions",
-default=1024,
+default=1536,
help_txt="The required dimensions of the embedding model. Currently utilized for vector DB indexing.",
)
server_url: str = configfield(
"server_url", default="", help_txt="The url of the server hosting nemo embedding model",
)
+api_key: str = configfield(
+    "api_key",
+    default=os.getenv("TFY_API_KEY"),
+    help_txt="API key for the OpenAI-compatible endpoint (read from TFY_API_KEY at import time)",
+)


@configclass
@@ -129,11 +143,16 @@ class RankingConfig(ConfigWizard):
model_engine: str = configfield(
"model_engine",
default="nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints,openai",
)
server_url: str = configfield(
"server_url", default="", help_txt="The url of the server hosting nemo Ranking model",
)
+api_key: str = configfield(
+    "api_key",
+    default=os.getenv("TFY_API_KEY"),
+    help_txt="API key for the OpenAI-compatible endpoint (read from TFY_API_KEY at import time)",
+)


@configclass
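One behavior of the new `api_key` fields worth noting: `default=os.getenv("TFY_API_KEY")` is evaluated once, when `configuration.py` is imported, so the variable has to be in the environment before the chain server starts. A minimal illustration:

```python
# Sketch: configfield defaults of the form os.getenv(...) are snapshots
# taken at import time, not live lookups.
import os

os.environ.pop("TFY_API_KEY", None)
DEFAULT_API_KEY = os.getenv("TFY_API_KEY")  # evaluated now -> None

os.environ["TFY_API_KEY"] = "set-too-late"
print(DEFAULT_API_KEY)  # still None
```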
2 changes: 2 additions & 0 deletions RAG/src/chain_server/requirements.txt
@@ -10,6 +10,8 @@ llama-index-llms-langchain==0.1.3
llama-index-embeddings-langchain==0.1.2
llama-index-vector-stores-milvus==0.1.6
llama-index-vector-stores-postgres==0.1.5
+llama-index-llms-openai-like
+langchain-openai>=0.0.2
pymilvus==2.4.0
dataclass-wizard==0.22.3
opencv-python==4.8.0.74
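The two new entries back the OpenAI code paths in `utils.py`. A quick import check, hypothetical but mirroring how they are consumed there:

```python
# Confirms the new dependencies resolve the symbols utils.py imports.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from llama_index.llms.openai_like import OpenAILike
```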
68 changes: 64 additions & 4 deletions RAG/src/chain_server/utils.py
@@ -22,6 +22,11 @@
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
from urllib.parse import urlparse

+import httpx
+
+# verify TLS unless SSL_VERIFY is explicitly set to "false" (case-insensitive)
+SSL_VERIFY = os.getenv("SSL_VERIFY", "true").lower() == "true"


import yaml

logger = logging.getLogger(__name__)
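`SSL_VERIFY` exists so internal gateways with self-signed certificates stay reachable; every outbound httpx client below is built with `verify=SSL_VERIFY`. A small sketch of what the toggle produces (assumption: only disable verification for trusted internal endpoints):

```python
# What SSL_VERIFY=false vs. the default maps to at the httpx level.
import httpx

insecure = httpx.Client(verify=False)  # SSL_VERIFY=false
secure = httpx.Client(verify=True)     # default
```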
@@ -45,7 +50,10 @@
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.postprocessor.types import BaseNodePostprocessor
from llama_index.core.schema import MetadataMode
-from llama_index.core.service_context import ServiceContext, set_global_service_context
+from llama_index.core.service_context import (
+    ServiceContext,
+    set_global_service_context,
+)
from llama_index.core.utils import get_tokenizer, globals_helper
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.langchain import LangChainLLM
@@ -94,15 +102,17 @@
from langchain_core.documents.compressor import BaseDocumentCompressor
from langchain_core.embeddings import Embeddings
from langchain_core.language_models.chat_models import SimpleChatModel
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from llama_index.core import Settings
+from llama_index.llms.openai_like import OpenAILike

from RAG.src.chain_server import configuration

if TYPE_CHECKING:
from RAG.src.chain_server.configuration_wizard import ConfigWizard

DEFAULT_MAX_CONTEXT = 1500


class LimitRetrievedNodesLength(BaseNodePostprocessor):
"""Llama Index chain filter to limit token lengths."""

@@ -397,6 +407,26 @@ def get_llm(**kwargs) -> LLM | SimpleChatModel:
top_p=kwargs.get('top_p', None),
max_tokens=kwargs.get('max_tokens', None),
)
+elif settings.llm.model_engine == "openai":
+    logger.info(f"Using llm model {settings.llm.model_name} served over an OpenAI-compatible API")
+
+    llm = ChatOpenAI(
+        model=settings.llm.model_name,
+        base_url=settings.llm.server_url,
+        api_key=settings.llm.api_key,
+        default_headers={
+            "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+        },
+        # route sync and async calls through clients that honour SSL_VERIFY
+        http_client=httpx.Client(verify=SSL_VERIFY),
+        http_async_client=httpx.AsyncClient(verify=SSL_VERIFY),
+    )
+    return llm

else:
raise RuntimeError(
"Unable to find any supported Large Language Model server. Supported engine name is nvidia-ai-endpoints."
Expand Down Expand Up @@ -438,6 +468,22 @@ def get_embedding_model() -> Embeddings:
else:
logger.info(f"Using embedding model {settings.embeddings.model_name} hosted at api catalog")
return NVIDIAEmbeddings(model=settings.embeddings.model_name, truncate="END")
+elif settings.embeddings.model_engine == "openai":
+    logger.info(f"Using embedding model {settings.embeddings.model_name} served over an OpenAI-compatible API")
+    embeddings = OpenAIEmbeddings(
+        model=settings.embeddings.model_name,
+        base_url=settings.embeddings.server_url,
+        api_key=settings.embeddings.api_key,
+        default_headers={
+            "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+        },
+        http_client=httpx.Client(verify=SSL_VERIFY),
+        http_async_client=httpx.AsyncClient(verify=SSL_VERIFY),
+    )
+    return embeddings
else:
raise RuntimeError(
"Unable to find any supported embedding model. Supported engine is huggingface and nvidia-ai-endpoints."
@@ -459,11 +505,25 @@ def get_ranking_model() -> BaseDocumentCompressor:
if settings.ranking.server_url:
logger.info(f"Using ranking model hosted at {settings.ranking.server_url}")
return NVIDIARerank(
base_url=f"http://{settings.ranking.server_url}/v1", top_n=settings.retriever.top_k, truncate="END"
base_url=f"{settings.ranking.server_url}/v1",
top_n=settings.retriever.top_k,
truncate="END",
)
elif settings.ranking.model_name:
logger.info(f"Using ranking model {settings.ranking.model_name} hosted at api catalog")
return NVIDIARerank(model=settings.ranking.model_name, top_n=settings.retriever.top_k, truncate="END")
+elif settings.ranking.model_engine == "openai":
+    logger.info(f"Using ranking model {settings.ranking.model_name} served over an OpenAI-compatible API")
+    llm = OpenAILike(
+        model=settings.ranking.model_name,
+        api_base=settings.ranking.server_url,
+        api_key=settings.ranking.api_key,
+        default_headers={
+            "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+        },
+        http_client=httpx.Client(verify=SSL_VERIFY),
+        async_http_client=httpx.AsyncClient(verify=SSL_VERIFY),
+    )
+    return llm
else:
logger.warning("Unable to find any supported ranking model. Supported engine is nvidia-ai-endpoints.")
except Exception as e:
3 changes: 2 additions & 1 deletion RAG/src/rag_playground/requirements.txt
@@ -1,12 +1,13 @@
PyYAML==6.0.1
dataclass-wizard==0.22.3
-gradio==4.43.0
+gradio==4.44.1
jinja2==3.1.3
numpy==1.26.4
opentelemetry-api==1.23.0
opentelemetry-exporter-otlp-proto-grpc==1.23.0
opentelemetry-sdk==1.23.0
protobuf==4.25.3
pycountry==23.12.11
+pydantic==2.10.6
tritonclient[all]==2.43.0
uvicorn==0.27.1
1 change: 1 addition & 0 deletions RAG/src/rag_playground/speech/pages/converse.py
@@ -148,6 +148,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks:
show_label=False,
container=False,
elem_id="microphone",
+api_name=False,
)

# user feedback
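`api_name=False` keeps the wired event out of Gradio's auto-generated API docs, so the microphone control stays usable from the UI without exposing a client-callable endpoint. A minimal, hypothetical illustration of the flag on an event listener:

```python
# Sketch: api_name=False suppresses the generated Gradio API endpoint.
import gradio as gr

with gr.Blocks() as demo:
    box = gr.Textbox()
    btn = gr.Button("Echo")
    btn.click(lambda s: s, box, box, api_name=False)  # no API endpoint created
```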