19 changes: 9 additions & 10 deletions RAG/examples/basic_rag/llamaindex/docker-compose.yaml
@@ -25,19 +25,18 @@ services:
APP_VECTORSTORE_URL: "http://milvus:19530"
# Type of vectordb used to store embedding supported type milvus, pgvector
APP_VECTORSTORE_NAME: "milvus"
-APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/nv-embedqa-e5-v5}
+APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-openai-main/text-embedding-ada-002}
# embedding model engine used for inference, supported type nvidia-ai-endpoints, huggingface
-APP_EMBEDDINGS_MODELENGINE: ${APP_EMBEDDINGS_MODELENGINE:-nvidia-ai-endpoints}
+APP_EMBEDDINGS_MODELENGINE: ${APP_EMBEDDINGS_MODELENGINE:-openai}
# url on which embedding model is hosted. If "", Nvidia hosted API is used
-APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-""}
+APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL:-"https://internal.devtest.truefoundry.tech/api/llm/api/inference/openai"}
# url on which llm model is hosted. If "", Nvidia hosted API is used
-APP_LLM_SERVERURL: ${APP_LLM_SERVERURL:-""}
-APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"meta/llama3-8b-instruct"}
+APP_LLM_SERVERURL: ${APP_LLM_SERVERURL:-"https://internal.devtest.truefoundry.tech/api/llm/api/inference/openai"}
+APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"openai-main/gpt-4o"}
# llm model engine used for inference, supported type nvidia-ai-endpoints, openai
-APP_LLM_MODELENGINE: ${APP_LLM_MODELENGINE:-nvidia-ai-endpoints}
-NVIDIA_API_KEY: ${NVIDIA_API_KEY}
+APP_LLM_MODELENGINE: ${APP_LLM_MODELENGINE:-openai}
# vectorstore collection name to store embeddings
-COLLECTION_NAME: ${COLLECTION_NAME:-developer_rag}
+COLLECTION_NAME: ${COLLECTION_NAME:-openai_rag}
APP_RETRIEVER_TOPK: 4
APP_RETRIEVER_SCORETHRESHOLD: 0.25
# observability server url
@@ -81,7 +80,7 @@ services:
APP_SERVERURL: http://chain-server
APP_SERVERPORT: 8081
# model name displayed on UI
-APP_MODELNAME: ${APP_LLM_MODELNAME:-"meta/llama3-8b-instruct"}
+APP_MODELNAME: ${APP_LLM_MODELNAME:-"openai-main/gpt-4o"}
# observability server url
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
@@ -96,4 +95,4 @@ services:

networks:
default:
-name: nvidia-rag
+name: openai-rag
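Note on the defaults above: the chain server now points at an OpenAI-compatible gateway (the TrueFoundry URL) instead of NVIDIA-hosted endpoints. A quick, hypothetical smoke test for that gateway, not part of this change; it assumes `TFY_API_KEY` holds a valid gateway key:

```python
# Hypothetical smoke test for the OpenAI-compatible gateway configured above.
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.environ.get(
        "APP_LLM_SERVERURL",
        "https://internal.devtest.truefoundry.tech/api/llm/api/inference/openai",
    ),
    api_key=os.environ["TFY_API_KEY"],  # assumption: the gateway key lives here
)
resp = client.chat.completions.create(
    model=os.environ.get("APP_LLM_MODELNAME", "openai-main/gpt-4o"),
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)
```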
22 changes: 11 additions & 11 deletions RAG/examples/local_deploy/docker-compose-nim-ms.yaml
@@ -10,7 +10,7 @@ services:
expose:
- "8000"
environment:
-NGC_API_KEY: ${NGC_API_KEY}
+NGC_API_KEY: ${NGC_API_KEY:-1234567890}
shm_size: 20gb
deploy:
resources:
@@ -37,7 +37,7 @@ services:
expose:
- "8000"
environment:
-NGC_API_KEY: ${NGC_API_KEY}
+NGC_API_KEY: ${NGC_API_KEY:-1234567890}
user: "${USERID}"
shm_size: 16GB
deploy:
@@ -65,22 +65,22 @@ services:
expose:
- "8000"
environment:
-NGC_API_KEY: ${NGC_API_KEY}
+NGC_API_KEY: ${NGC_API_KEY:-1234567890}
user: "${USERID}"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 10s
timeout: 20s
retries: 100
-deploy:
-  resources:
-    reservations:
-      devices:
-        - driver: nvidia
-          device_ids: ['${RANKING_MS_GPU_ID:-0}']
-          capabilities: [gpu]
+# deploy:
+#   resources:
+#     reservations:
+#       devices:
+#         - driver: nvidia
+#           device_ids: ['${RANKING_MS_GPU_ID:-0}']
+#           capabilities: [gpu]
profiles: ["nemo-retriever"]

networks:
default:
-name: nvidia-rag
+name: openai-rag
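The `${NGC_API_KEY:-1234567890}` placeholder leans on compose's `${VAR:-default}` substitution: the fallback applies only when the variable is unset or empty, so a real key exported in the shell still wins. A minimal Python sketch of that resolution rule:

```python
# Sketch of docker-compose's ${VAR:-default} semantics: fall back when
# the variable is unset or empty, otherwise keep the exported value.
import os

def compose_default(name: str, fallback: str) -> str:
    value = os.environ.get(name)
    return value if value else fallback

print(compose_default("NGC_API_KEY", "1234567890"))
```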
16 changes: 8 additions & 8 deletions RAG/examples/local_deploy/docker-compose-vectordb.yaml
@@ -74,13 +74,13 @@ services:
depends_on:
- "etcd"
- "minio"
-deploy:
-  resources:
-    reservations:
-      devices:
-        - driver: nvidia
-          capabilities: ["gpu"]
-          device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}']
+# deploy:
+#   resources:
+#     reservations:
+#       devices:
+#         - driver: nvidia
+#           capabilities: ["gpu"]
+#           device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}']
profiles: ["nemo-retriever", "milvus", ""]

elasticsearch:
@@ -119,4 +119,4 @@

networks:
default:
-name: nvidia-rag
+name: openai-rag
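With the GPU reservation commented out, Milvus runs in CPU mode; the client-facing connection path is unchanged. A hypothetical reachability check (not part of this change), assuming the default 19530 port mapping:

```python
# Hypothetical check that the CPU-mode Milvus from this compose file is up.
from pymilvus import connections, utility

connections.connect(alias="default", uri="http://localhost:19530")
print(utility.get_server_version())
```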
31 changes: 25 additions & 6 deletions RAG/src/chain_server/configuration.py
@@ -14,8 +14,12 @@
# limitations under the License.

"""The definition of the application configuration."""
-from RAG.src.chain_server.configuration_wizard import ConfigWizard, configclass, configfield
+import os
+
+from RAG.src.chain_server.configuration_wizard import (
+    ConfigWizard,
+    configclass,
+    configfield,
+)

@configclass
class VectorStoreConfig(ConfigWizard):
@@ -61,13 +65,18 @@ class LLMConfig(ConfigWizard):
model_engine: str = configfield(
"model_engine",
default="nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints,openai",
)
model_name_pandas_ai: str = configfield(
"model_name_pandas_ai",
default="ai-mixtral-8x7b-instruct",
help_txt="The name of the ai catalog model to be used with PandasAI agent",
)
+api_key: str = configfield(
+    "api_key",
+    default=os.getenv("TFY_API_KEY"),
+    help_txt="API key for the OpenAI-compatible endpoint (read from TFY_API_KEY at import time)",
+)


@configclass
@@ -104,16 +113,21 @@ class EmbeddingConfig(ConfigWizard):
model_engine: str = configfield(
"model_engine",
default="nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are hugginface",
help_txt="The server type of the hosted model. Allowed values are hugginface,openai",
)
dimensions: int = configfield(
"dimensions",
-default=1024,
+default=1536,
help_txt="The required dimensions of the embedding model. Currently utilized for vector DB indexing.",
)
server_url: str = configfield(
"server_url", default="", help_txt="The url of the server hosting nemo embedding model",
)
+api_key: str = configfield(
+    "api_key",
+    default=os.getenv("TFY_API_KEY"),
+    help_txt="API key for the OpenAI-compatible endpoint (read from TFY_API_KEY at import time)",
+)


@configclass
@@ -129,11 +143,16 @@ class RankingConfig(ConfigWizard):
model_engine: str = configfield(
"model_engine",
default="nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints",
help_txt="The server type of the hosted model. Allowed values are nvidia-ai-endpoints,openai",
)
server_url: str = configfield(
"server_url", default="", help_txt="The url of the server hosting nemo Ranking model",
)
+api_key: str = configfield(
+    "api_key",
+    default=os.getenv("TFY_API_KEY"),
+    help_txt="API key for the OpenAI-compatible endpoint (read from TFY_API_KEY at import time)",
+)


@configclass
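One behavior of the new `api_key` fields worth noting: `default=os.getenv("TFY_API_KEY")` is evaluated once, when `configuration.py` is imported, so the variable has to be in the environment before the chain server starts. A minimal illustration:

```python
# Sketch: configfield defaults of the form os.getenv(...) are snapshots
# taken at import time, not live lookups.
import os

os.environ.pop("TFY_API_KEY", None)
DEFAULT_API_KEY = os.getenv("TFY_API_KEY")  # evaluated now -> None

os.environ["TFY_API_KEY"] = "set-too-late"
print(DEFAULT_API_KEY)  # still None
```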
2 changes: 2 additions & 0 deletions RAG/src/chain_server/requirements.txt
@@ -10,6 +10,8 @@ llama-index-llms-langchain==0.1.3
llama-index-embeddings-langchain==0.1.2
llama-index-vector-stores-milvus==0.1.6
llama-index-vector-stores-postgres==0.1.5
+llama-index-llms-openai-like
+langchain-openai>=0.0.2
pymilvus==2.4.0
dataclass-wizard==0.22.3
opencv-python==4.8.0.74
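The two new entries back the OpenAI code paths in `utils.py`. A quick import check, hypothetical but mirroring how they are consumed there:

```python
# Confirms the new dependencies resolve the symbols utils.py imports.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from llama_index.llms.openai_like import OpenAILike
```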
68 changes: 64 additions & 4 deletions RAG/src/chain_server/utils.py
@@ -22,6 +22,11 @@
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
from urllib.parse import urlparse

+import httpx
+
+# verify TLS unless SSL_VERIFY is explicitly set to "false" (case-insensitive)
+SSL_VERIFY = os.getenv("SSL_VERIFY", "true").lower() == "true"


import yaml

logger = logging.getLogger(__name__)
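`SSL_VERIFY` exists so internal gateways with self-signed certificates stay reachable; every outbound httpx client below is built with `verify=SSL_VERIFY`. A small sketch of what the toggle produces (assumption: only disable verification for trusted internal endpoints):

```python
# What SSL_VERIFY=false vs. the default maps to at the httpx level.
import httpx

insecure = httpx.Client(verify=False)  # SSL_VERIFY=false
secure = httpx.Client(verify=True)     # default
```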
@@ -45,7 +50,10 @@
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.postprocessor.types import BaseNodePostprocessor
from llama_index.core.schema import MetadataMode
-from llama_index.core.service_context import ServiceContext, set_global_service_context
+from llama_index.core.service_context import (
+    ServiceContext,
+    set_global_service_context,
+)
from llama_index.core.utils import get_tokenizer, globals_helper
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.langchain import LangChainLLM
@@ -94,15 +102,17 @@
from langchain_core.documents.compressor import BaseDocumentCompressor
from langchain_core.embeddings import Embeddings
from langchain_core.language_models.chat_models import SimpleChatModel
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from llama_index.core import Settings
+from llama_index.llms.openai_like import OpenAILike

from RAG.src.chain_server import configuration

if TYPE_CHECKING:
from RAG.src.chain_server.configuration_wizard import ConfigWizard

DEFAULT_MAX_CONTEXT = 1500


class LimitRetrievedNodesLength(BaseNodePostprocessor):
"""Llama Index chain filter to limit token lengths."""

@@ -397,6 +407,26 @@ def get_llm(**kwargs) -> LLM | SimpleChatModel:
top_p=kwargs.get('top_p', None),
max_tokens=kwargs.get('max_tokens', None),
)
+elif settings.llm.model_engine == "openai":
+    logger.info(f"Using llm model {settings.llm.model_name} served over an OpenAI-compatible API")
+
+    llm = ChatOpenAI(
+        model=settings.llm.model_name,
+        base_url=settings.llm.server_url,
+        api_key=settings.llm.api_key,
+        default_headers={
+            "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+        },
+        # route sync and async calls through clients that honour SSL_VERIFY
+        http_client=httpx.Client(verify=SSL_VERIFY),
+        http_async_client=httpx.AsyncClient(verify=SSL_VERIFY),
+    )
+    return llm

else:
raise RuntimeError(
"Unable to find any supported Large Language Model server. Supported engine name is nvidia-ai-endpoints."
Expand Down Expand Up @@ -438,6 +468,22 @@ def get_embedding_model() -> Embeddings:
else:
logger.info(f"Using embedding model {settings.embeddings.model_name} hosted at api catalog")
return NVIDIAEmbeddings(model=settings.embeddings.model_name, truncate="END")
+elif settings.embeddings.model_engine == "openai":
+    logger.info(f"Using embedding model {settings.embeddings.model_name} served over an OpenAI-compatible API")
+    embeddings = OpenAIEmbeddings(
+        model=settings.embeddings.model_name,
+        base_url=settings.embeddings.server_url,
+        api_key=settings.embeddings.api_key,
+        default_headers={
+            "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+        },
+        http_client=httpx.Client(verify=SSL_VERIFY),
+        http_async_client=httpx.AsyncClient(verify=SSL_VERIFY),
+    )
+    return embeddings
else:
raise RuntimeError(
"Unable to find any supported embedding model. Supported engine is huggingface and nvidia-ai-endpoints."
@@ -459,11 +505,25 @@ def get_ranking_model() -> BaseDocumentCompressor:
if settings.ranking.server_url:
logger.info(f"Using ranking model hosted at {settings.ranking.server_url}")
return NVIDIARerank(
base_url=f"http://{settings.ranking.server_url}/v1", top_n=settings.retriever.top_k, truncate="END"
base_url=f"{settings.ranking.server_url}/v1",
top_n=settings.retriever.top_k,
truncate="END",
)
elif settings.ranking.model_name:
logger.info(f"Using ranking model {settings.ranking.model_name} hosted at api catalog")
return NVIDIARerank(model=settings.ranking.model_name, top_n=settings.retriever.top_k, truncate="END")
+elif settings.ranking.model_engine == "openai":
+    logger.info(f"Using ranking model {settings.ranking.model_name} served over an OpenAI-compatible API")
+    llm = OpenAILike(
+        model=settings.ranking.model_name,
+        api_base=settings.ranking.server_url,
+        api_key=settings.ranking.api_key,
+        default_headers={
+            "X-TFY-METADATA": '{"tfy_log_request":"true"}',
+        },
+        http_client=httpx.Client(verify=SSL_VERIFY),
+        async_http_client=httpx.AsyncClient(verify=SSL_VERIFY),
+    )
+    return llm
else:
logger.warning("Unable to find any supported ranking model. Supported engine is nvidia-ai-endpoints.")
except Exception as e:
3 changes: 2 additions & 1 deletion RAG/src/rag_playground/requirements.txt
@@ -1,12 +1,13 @@
PyYAML==6.0.1
dataclass-wizard==0.22.3
-gradio==4.43.0
+gradio==4.44.1
jinja2==3.1.3
numpy==1.26.4
opentelemetry-api==1.23.0
opentelemetry-exporter-otlp-proto-grpc==1.23.0
opentelemetry-sdk==1.23.0
protobuf==4.25.3
pycountry==23.12.11
+pydantic==2.10.6
tritonclient[all]==2.43.0
uvicorn==0.27.1
1 change: 1 addition & 0 deletions RAG/src/rag_playground/speech/pages/converse.py
@@ -148,6 +148,7 @@ def build_page(client: chat_client.ChatClient) -> gr.Blocks:
show_label=False,
container=False,
elem_id="microphone",
+api_name=False,
)

# user feedback
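`api_name=False` keeps the wired event out of Gradio's auto-generated API docs, so the microphone control stays usable from the UI without exposing a client-callable endpoint. A minimal, hypothetical illustration of the flag on an event listener:

```python
# Sketch: api_name=False suppresses the generated Gradio API endpoint.
import gradio as gr

with gr.Blocks() as demo:
    box = gr.Textbox()
    btn = gr.Button("Echo")
    btn.click(lambda s: s, box, box, api_name=False)  # no API endpoint created
```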