Nvidia’s NV-Embed-QA model delivers 1024-dimensional sentence vectors that work nicely with SurrealDB’s built-in K-nearest-neighbour search (HNSW, brute force, or M-Tree). The walkthrough below shows how to store and query those vectors without Qdrant, using only SurrealDB and the Nvidia Retrieval API.
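For orientation, here is a minimal sketch of what those three flavours look like in SurrealQL. The index names and parameter values are illustrative, not tuned recommendations:

```sql
-- HNSW: fast approximate KNN (the ef candidate-list size goes in the operator)
DEFINE INDEX hnsw_embed ON GpuDocs FIELDS embedding HNSW DIMENSION 1024 DIST COSINE;
SELECT text FROM GpuDocs WHERE embedding <|3,40|> $vec;

-- M-Tree: exact KNN backed by a metric tree
DEFINE INDEX mtree_embed ON GpuDocs FIELDS embedding MTREE DIMENSION 1024 DIST COSINE;
SELECT text FROM GpuDocs WHERE embedding <|3|> $vec;

-- Brute force: no index at all; the metric is named in the operator itself
SELECT text FROM GpuDocs WHERE embedding <|3,COSINE|> $vec;
```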
```bash
# SurrealDB Python SDK (async) + requests for Nvidia API calls
pip install surrealdb requests
```
Run a local SurrealDB instance first, e.g.
```bash
surreal start --user root --pass root --bind 0.0.0.0:8000 file:/data/db
```
```python
import os
import requests
from typing import List, Dict, Any

NVIDIA_BASE = "https://ai.api.nvidia.com/v1/retrieval/nvidia/embeddings"
NVIDIA_API_KEY = os.environ["NVIDIA_API_KEY"]  # set in your shell

session = requests.Session()
headers = {
    "Authorization": f"Bearer {NVIDIA_API_KEY}",
    "Accept": "application/json",
    "Content-Type": "application/json",
}
```
```python
texts = [
    "CUDA 12 adds cooperative groups on all SM architectures.",
    "TensorRT 10 delivers real-time LLM inference on Hopper GPUs.",
    "NVLink enables high-bandwidth inter-GPU communication in DGX systems.",
]
```
```python
def get_embeddings(inputs: List[str], input_type: str = "passage") -> List[List[float]]:
    """Get embeddings from the Nvidia Retrieval API."""
    payload = {
        "input": inputs,
        "input_type": input_type,  # "passage" for documents, "query" for queries
        "model": "NV-Embed-QA",
    }
    resp = session.post(NVIDIA_BASE, headers=headers, json=payload)
    resp.raise_for_status()  # surface HTTP errors instead of a KeyError below
    return [row["embedding"] for row in resp.json()["data"]]

# Get embeddings for all texts
vectors = get_embeddings(texts)
```
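One practical note: embedding endpoints generally cap how many inputs a single request may carry. The exact limit for this endpoint isn't covered here, so the batch size below is an assumption; a chunking wrapper might look like this:

```python
# BATCH is an assumed per-request cap, not a documented Nvidia limit --
# adjust it to whatever the API actually accepts.
BATCH = 50

def get_embeddings_batched(inputs: List[str], input_type: str = "passage") -> List[List[float]]:
    """Embed a long list of texts in fixed-size chunks."""
    out: List[List[float]] = []
    for i in range(0, len(inputs), BATCH):
        out.extend(get_embeddings(inputs[i:i + BATCH], input_type))
    return out

# Sanity check: NV-Embed-QA vectors should be 1024-dimensional
assert len(vectors[0]) == 1024
```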
```python
import asyncio
from surrealdb import AsyncSurreal

TABLE = "GpuDocs"  # our table name

async def ingest():
    async with AsyncSurreal("ws://localhost:8000/rpc") as db:
        await db.signin({"username": "root", "password": "root"})
        await db.use("test", "test")  # <namespace>, <database>

        # ---- idempotent schema & HNSW index ----
        # Table names cannot be bound as query parameters in DEFINE
        # statements, so the constant is interpolated into the string.
        await db.query(f"""
            DEFINE TABLE IF NOT EXISTS {TABLE};
            DEFINE FIELD text ON {TABLE} TYPE string;
            DEFINE FIELD embedding ON {TABLE} TYPE array<float>;
            DEFINE INDEX IF NOT EXISTS hnsw_embed ON {TABLE}
                FIELDS embedding HNSW DIMENSION 1024 DIST COSINE;
        """)

        # ---- insert the rows -------------------
        # insert() accepts an array; record ids are auto-generated
        await db.insert(TABLE, [
            {"text": t, "embedding": v}
            for t, v in zip(texts, vectors)
        ])

asyncio.run(ingest())
```
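After a large bulk load it is worth rebuilding the HNSW index (see the feature table at the end). A minimal sketch, assuming the index name defined above:

```python
async def rebuild_index():
    """Rebuild the HNSW index after a large bulk load."""
    async with AsyncSurreal("ws://localhost:8000/rpc") as db:
        await db.signin({"username": "root", "password": "root"})
        await db.use("test", "test")
        await db.query(f"REBUILD INDEX IF EXISTS hnsw_embed ON {TABLE};")
```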
```python
async def search(query: str, k: int = 3) -> List[Dict[str, Any]]:
    """Search for similar documents using vector similarity."""
    # ----- embed the query ------------------
    q_vec = get_embeddings([query], input_type="query")[0]

    # ----- SurrealDB KNN query --------------
    async with AsyncSurreal("ws://localhost:8000/rpc") as db:
        await db.signin({"username": "root", "password": "root"})
        await db.use("test", "test")
        # The KNN operator takes literal numbers (K and, for HNSW, the ef
        # candidate-list size -- 40 is an arbitrary but reasonable choice),
        # so they are interpolated; the query vector is a bound parameter.
        # Cosine distance = 1 - cosine similarity, so lower = more similar.
        return await db.query(f"""
            SELECT text,
                   1 - vector::similarity::cosine(embedding, $vec) AS score
            FROM {TABLE}
            WHERE embedding <|{int(k)},40|> $vec  -- K-nearest-neighbour via HNSW
            ORDER BY score;                       -- lower = more similar
        """, {"vec": q_vec})

# Example usage
matches = asyncio.run(search("How do I speed up LLM inference on GPUs?"))
for row in matches:  # pretty-print
    print(f"{row['score']:.4f}  {row['text']}")
```
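If you have not defined an index, or want exact rather than approximate results, the same search works brute force by naming the metric inside the KNN operator. A sketch reusing the helpers above:

```python
async def search_exact(query: str, k: int = 3) -> List[Dict[str, Any]]:
    """Exact KNN without an index: the metric goes inside the operator."""
    q_vec = get_embeddings([query], input_type="query")[0]
    async with AsyncSurreal("ws://localhost:8000/rpc") as db:
        await db.signin({"username": "root", "password": "root"})
        await db.use("test", "test")
        return await db.query(f"""
            SELECT text,
                   1 - vector::similarity::cosine(embedding, $vec) AS score
            FROM {TABLE}
            WHERE embedding <|{int(k)},COSINE|> $vec  -- brute-force scan
            ORDER BY score;
        """, {"vec": q_vec})
```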
Expected console output (shortened):
```
0.1378  TensorRT 10 delivers real-time LLM inference on Hopper GPUs.
0.4631  CUDA 12 adds cooperative groups on all SM architectures.
0.5247  NVLink enables high-bandwidth inter-GPU communication in DGX systems.
```
The same flow works from JavaScript (Node), here using the current `surrealdb` SDK (formerly published as `surrealdb.js`):

```js
// npm i surrealdb node-fetch
import Surreal from 'surrealdb';
import fetch from 'node-fetch';

const db = new Surreal();
await db.connect('ws://localhost:8000/rpc');
await db.signin({ username: 'root', password: 'root' });
await db.use({ namespace: 'test', database: 'test' });

const NVIDIA_BASE = 'https://ai.api.nvidia.com/v1/retrieval/nvidia/embeddings';
const headers = {
  'Authorization': `Bearer ${process.env.NVIDIA_API_KEY}`,
  'Accept': 'application/json',
  'Content-Type': 'application/json',
};

const passages = [
  'CUDA 12 adds cooperative groups on all SM architectures.',
  'TensorRT 10 delivers real-time LLM inference on Hopper GPUs.',
];

const embRes = await fetch(NVIDIA_BASE, {
  method: 'POST',
  headers,
  body: JSON.stringify({
    input: passages,
    input_type: 'passage',
    model: 'NV-Embed-QA',
  }),
});
const vectors = (await embRes.json()).data.map((d) => d.embedding);

// schema & index
await db.query(`
  DEFINE TABLE IF NOT EXISTS GpuDocs;
  DEFINE FIELD text ON GpuDocs TYPE string;
  DEFINE FIELD embedding ON GpuDocs TYPE array<float>;
  DEFINE INDEX IF NOT EXISTS hnsw_embed ON GpuDocs
      FIELDS embedding HNSW DIMENSION 1024 DIST COSINE;
`);

// insert (record ids are auto-generated)
await db.insert('GpuDocs', passages.map((p, i) => ({
  text: p,
  embedding: vectors[i],
})));

// embed a query & search
const qRes = await fetch(NVIDIA_BASE, {
  method: 'POST',
  headers,
  body: JSON.stringify({ input: 'GPU inference acceleration', input_type: 'query', model: 'NV-Embed-QA' }),
});
const qVec = (await qRes.json()).data[0].embedding;

const [matches] = await db.query(`
  SELECT text, 1 - vector::similarity::cosine(embedding, $vec) AS score
  FROM GpuDocs
  WHERE embedding <|2,40|> $vec
  ORDER BY score;
`, { vec: qVec });
console.log(matches);
```
| Feature | SurrealQL | Notes |
|---|---|---|
| Vector field | `embedding` (`array<float>`) | Any array of floats works. |
| Index | `DEFINE INDEX … HNSW DIMENSION 1024` | Re-run `REBUILD INDEX` after large bulk loads. |
| KNN search | `WHERE embedding <\|k,ef\|> $vec` | Brute force takes a metric instead: `<\|k,COSINE\|>` |
| Distance functions | `vector::similarity::cosine(…)` | Also `vector::distance::euclidean`, `manhattan`, etc. (demo below) |
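To get a feel for those functions, here is a minimal sketch that evaluates a few of them on toy vectors (the values are arbitrary):

```python
async def distance_demo():
    """Evaluate a few vector functions on toy 2-D vectors."""
    async with AsyncSurreal("ws://localhost:8000/rpc") as db:
        await db.signin({"username": "root", "password": "root"})
        await db.use("test", "test")
        print(await db.query("RETURN vector::similarity::cosine([1, 0], [0, 1]);"))   # 0 (orthogonal)
        print(await db.query("RETURN vector::distance::euclidean([1, 0], [0, 1]);"))  # ~1.4142
        print(await db.query("RETURN vector::distance::manhattan([1, 0], [0, 1]);"))  # 2

asyncio.run(distance_demo())
```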
Using SurrealDB lets you keep metadata, relational links and time-series data side-by-side with your vectors—no extra service layer required.
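As an illustration of that point, a vector search can sit in the same `WHERE` clause as an ordinary metadata filter. The `category` field below is hypothetical (it is not part of the walkthrough schema), and the brute-force operator is used so the filter semantics stay plain:

```python
async def search_filtered(query: str, k: int = 3) -> List[Dict[str, Any]]:
    """Sketch: combine a metadata filter with KNN in one SurrealQL query.
    NOTE: `category` is a hypothetical field -- define and populate it first."""
    q_vec = get_embeddings([query], input_type="query")[0]
    async with AsyncSurreal("ws://localhost:8000/rpc") as db:
        await db.signin({"username": "root", "password": "root"})
        await db.use("test", "test")
        return await db.query(f"""
            SELECT text,
                   1 - vector::similarity::cosine(embedding, $vec) AS score
            FROM {TABLE}
            WHERE category = $cat
              AND embedding <|{int(k)},COSINE|> $vec
            ORDER BY score;
        """, {"vec": q_vec, "cat": "inference"})
```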