feat: Add RAGEngine CRD (#597)

This PR adds an initial draft of the RAGEngine CRD in Kaito.

A RAGEngine CRD defines all the resources needed to run RAG on top of an
LLM inference service. Upon creation of a RAGEngine CR, a new controller
creates a deployment that runs a RAG engine instance. The instance
provides HTTP endpoints for both the `index` and `query` services. The
instance can either use a public embedding service or run a local
embedding model on GPUs to convert the input index data into vectors. It
can also connect to a vector DB instance to persist the embedding
vectors; by default, it uses an in-memory vector DB. The instance uses
the `llamaIndex` library to orchestrate the workflow. Once the RAGEngine
instance is up and running, users should send questions to the `query`
endpoint of the RAG instance instead of the normal `chat` endpoint of
the inference service.

The RAGEngine is intended to be "standalone". It can use any public
inference service or an inference service hosted by a Kaito workspace.

The RAG engine instance is designed to help retrieve prompt context from
unstructured data (arbitrary index data provided by users). Retrieving
from structured data or a search engine is out of scope for now.
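
For illustration, a minimal RAGEngine object that pairs a remote embedding service with an existing inference endpoint could be constructed as in the Go sketch below. The module import path, endpoint URLs, and secret names are assumptions for this example, not values defined by this PR.

package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/azure/kaito/api/v1alpha1" // module path assumed
)

// newSampleRAGEngine builds a RAGEngine CR that uses a remote embedding
// service and an existing inference endpoint; every URL and secret name
// here is a placeholder.
func newSampleRAGEngine() *v1alpha1.RAGEngine {
	return &v1alpha1.RAGEngine{
		ObjectMeta: metav1.ObjectMeta{Name: "ragengine-sample", Namespace: "default"},
		Spec: &v1alpha1.RAGEngineSpec{
			Embedding: &v1alpha1.EmbeddingSpec{
				Remote: &v1alpha1.RemoteEmbeddingSpec{
					URL:          "https://api.openai.com/v1/embeddings",
					AccessSecret: "embedding-api-token",
				},
			},
			InferenceService: &v1alpha1.InferenceServiceSpec{
				URL: "http://workspace-inference.default.svc.cluster.local/chat",
			},
		},
	}
}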
Fei-Guo committed Sep 18, 2024
1 parent f3d6e09 commit 2cdc682
Showing 7 changed files with 591 additions and 8 deletions.
109 changes: 109 additions & 0 deletions api/v1alpha1/ragengine_types.go
@@ -0,0 +1,109 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

package v1alpha1

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type StorageSpec struct {
	// TODO: add vendor-specific APIs for accessing vector DB services here.
}

type RemoteEmbeddingSpec struct {
	// URL points to a publicly available embedding service, such as OpenAI.
	URL string `json:"url"`
	// AccessSecret is the name of the secret that contains the service access token.
	// +optional
	AccessSecret string `json:"accessSecret,omitempty"`
}

type LocalEmbeddingSpec struct {
	// Image is the name of the containerized embedding model image.
	// +optional
	Image string `json:"image,omitempty"`
	// ImagePullSecret is the name of the secret used to pull the embedding model image.
	// +optional
	ImagePullSecret string `json:"imagePullSecret,omitempty"`
	// ModelID is the ID of the embedding model hosted by huggingface, e.g., BAAI/bge-small-en-v1.5.
	// When this field is specified, the RAG engine will download the embedding model
	// from the huggingface repository during startup. The embedding model will not be persisted
	// in local storage. Note that if Image is specified, ModelID should not be specified, and vice versa.
	// +optional
	ModelID string `json:"modelID,omitempty"`
	// ModelAccessSecret is the name of the secret that contains the huggingface access token.
	// +optional
	ModelAccessSecret string `json:"modelAccessSecret,omitempty"`
}
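
// Illustrative only, not part of this commit: a LocalEmbeddingSpec that
// downloads the model from huggingface at startup. The secret name is a
// placeholder. Because Image and ModelID are mutually exclusive, only
// ModelID is set here.
var exampleLocalEmbedding = LocalEmbeddingSpec{
	ModelID:           "BAAI/bge-small-en-v1.5",
	ModelAccessSecret: "hf-token",
}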

type EmbeddingSpec struct {
	// Remote specifies how to generate embeddings for index data using a remote service.
	// Note that either Remote or Local needs to be specified, not both.
	// +optional
	Remote *RemoteEmbeddingSpec `json:"remote,omitempty"`
	// Local specifies how to generate embeddings for index data using a model run locally.
	// +optional
	Local *LocalEmbeddingSpec `json:"local,omitempty"`
}
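
// Illustrative only, not part of this commit: a minimal sketch of the
// "either Remote or Local, not both" rule documented above. The method
// name is an assumption; in practice such a check would live in a
// validating webhook or be expressed via CEL validation markers.
func (e *EmbeddingSpec) validateExclusive() bool {
	// Exactly one of Remote and Local must be set.
	return (e.Remote != nil) != (e.Local != nil)
}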

type InferenceServiceSpec struct {
	// URL points to a running inference service endpoint which accepts http(s) payloads.
	URL string `json:"url"`
	// AccessSecret is the name of the secret that contains the service access token.
	// +optional
	AccessSecret string `json:"accessSecret,omitempty"`
}

type RAGEngineSpec struct {
	// Compute specifies the dedicated GPU resource used by an embedding model running locally, if required.
	// +optional
	Compute *ResourceSpec `json:"compute,omitempty"`
	// Storage specifies how to access the vector database used to save the embedding vectors.
	// If this field is not specified, an in-memory vector DB will be used by default,
	// and the data will not be persisted.
	// +optional
	Storage *StorageSpec `json:"storage,omitempty"`
	// Embedding specifies whether the RAG engine generates embedding vectors using a remote service
	// or an embedding model running locally.
	Embedding *EmbeddingSpec `json:"embedding"`
	// InferenceService specifies the inference service that the RAG engine connects to for answering queries.
	InferenceService *InferenceServiceSpec `json:"inferenceService"`
	// QueryServiceName is the name of the service which exposes the endpoint for accepting user queries to the
	// inference service. If not specified, a default service name will be created by the RAG engine.
	// +optional
	QueryServiceName string `json:"queryServiceName,omitempty"`
	// IndexServiceName is the name of the service which exposes the endpoint for users to input the index data
	// to generate embeddings. If not specified, a default service name will be created by the RAG engine.
	// +optional
	IndexServiceName string `json:"indexServiceName,omitempty"`
}

// RAGEngineStatus defines the observed state of RAGEngine.
type RAGEngineStatus struct {
	Conditions []metav1.Condition `json:"conditions,omitempty"`
}
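
// Illustrative only, not part of this commit: a sketch of how a controller
// might record readiness on the status. It assumes an additional import of
// k8s.io/apimachinery/pkg/api/meta; the condition type and reason below are
// placeholders, not names defined by this API.
func markReady(re *RAGEngine) {
	meta.SetStatusCondition(&re.Status.Conditions, metav1.Condition{
		Type:    "RAGEngineReady",
		Status:  metav1.ConditionTrue,
		Reason:  "DeploymentReady",
		Message: "RAG engine deployment is running and serving the index and query endpoints",
	})
}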

// RAGEngine is the Schema for the ragengine API
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:path=ragengines,scope=Namespaced,categories=ragengine
// +kubebuilder:storageversion
type RAGEngine struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`

	Spec *RAGEngineSpec `json:"spec,omitempty"`

	Status RAGEngineStatus `json:"status,omitempty"`
}

// RAGEngineList contains a list of RAGEngine
// +kubebuilder:object:root=true
type RAGEngineList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata,omitempty"`
	Items           []RAGEngine `json:"items"`
}

func init() {
	SchemeBuilder.Register(&RAGEngine{}, &RAGEngineList{})
}
205 changes: 205 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go
