Vector Store Guide¶

Complete guide to using the CockroachDB vector store.

Overview¶

The AsyncCockroachDBVectorStore provides a high-performance interface for storing and searching vector embeddings in CockroachDB.

Basic Usage¶

Initialization¶

from langchain_cockroachdb import AsyncCockroachDBVectorStore, CockroachDBEngine
from langchain_openai import OpenAIEmbeddings

# Create engine
engine = CockroachDBEngine.from_connection_string(
    "cockroachdb://user:pass@host:26257/db"
)

# Initialize table
await engine.ainit_vectorstore_table(
    table_name="documents",
    vector_dimension=1536,
)

# Create vector store
vectorstore = AsyncCockroachDBVectorStore(
    engine=engine,
    embeddings=OpenAIEmbeddings(),
    collection_name="documents",
)

Adding Documents¶

From Text¶

# Simple text
texts = [
    "CockroachDB is a distributed database",
    "LangChain simplifies LLM applications",
]
ids = await vectorstore.aadd_texts(texts)
print(f"Added {len(ids)} documents")

With Metadata¶

texts = ["Doc 1", "Doc 2", "Doc 3"]
metadatas = [
    {"source": "web", "category": "tech", "year": 2024},
    {"source": "pdf", "category": "science", "year": 2023},
    {"source": "api", "category": "tech", "year": 2024},
]

ids = await vectorstore.aadd_texts(texts, metadatas=metadatas)

From Documents¶

from langchain_core.documents import Document

documents = [
    Document(
        page_content="Content here",
        metadata={"source": "file.txt", "page": 1}
    ),
    Document(
        page_content="More content",
        metadata={"source": "file.txt", "page": 2}
    ),
]

ids = await vectorstore.aadd_documents(documents)

With Custom IDs¶

import uuid

custom_ids = [str(uuid.uuid4()) for _ in range(len(texts))]
ids = await vectorstore.aadd_texts(texts, ids=custom_ids)

Batch Size Control¶

# Override default batch size at runtime
ids = await vectorstore.aadd_texts(
    large_text_list,
    batch_size=500  # Larger batches for throughput
)

Searching¶

Similarity Search¶

# Basic search
results = await vectorstore.asimilarity_search(
    "distributed databases",
    k=5
)

for doc in results:
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")

With Scores¶

results = await vectorstore.asimilarity_search_with_score(
    "distributed databases",
    k=5
)

for doc, score in results:
    print(f"Score: {score:.4f}")
    print(f"Content: {doc.page_content}")

By Vector¶

# Search with your own vector
embedding = embeddings.embed_query("my query")
results = await vectorstore.asimilarity_search_by_vector(
    embedding,
    k=5
)

With Relevance Scores¶

results = await vectorstore.asimilarity_search_with_relevance_scores(
    "query",
    k=5,
    score_threshold=0.7  # Only return scores above threshold
)

Filtering¶

Basic Filters¶

# Equality
results = await vectorstore.asimilarity_search(
    "tech content",
    k=5,
    filter={"category": "tech"}
)

# Multiple conditions (implicit AND)
results = await vectorstore.asimilarity_search(
    "recent tech",
    k=5,
    filter={"category": "tech", "year": 2024}
)

Advanced Filters¶

Comparison Operators¶

# Greater than
filter={"year": {"$gt": 2020}}

# Greater than or equal
filter={"year": {"$gte": 2020}}

# Less than
filter={"score": {"$lt": 0.9}}

# Less than or equal
filter={"score": {"$lte": 0.9}}

# Not equal
filter={"status": {"$ne": "archived"}}

IN and NOT IN¶

# IN operator
filter={"category": {"$in": ["tech", "science", "engineering"]}}

# NOT IN operator
filter={"status": {"$nin": ["draft", "archived"]}}

Logical Operators¶

# AND (explicit)
filter={
    "$and": [
        {"category": "tech"},
        {"year": {"$gte": 2023}}
    ]
}

# OR
filter={
    "$or": [
        {"category": "tech"},
        {"category": "science"}
    ]
}

# Complex nested
filter={
    "$and": [
        {"year": {"$gte": 2023}},
        {
            "$or": [
                {"category": "tech"},
                {"category": "science"}
            ]
        }
    ]
}

Filter Examples¶

# Recent tech or science documents
results = await vectorstore.asimilarity_search(
    "innovation",
    k=10,
    filter={
        "$and": [
            {"year": {"$gte": 2023}},
            {"category": {"$in": ["tech", "science"]}}
        ]
    }
)

# High-quality documents from specific sources
results = await vectorstore.asimilarity_search(
    "research",
    k=10,
    filter={
        "$and": [
            {"source": {"$in": ["arxiv", "nature", "ieee"]}},
            {"quality_score": {"$gte": 0.8}}
        ]
    }
)

Updating and Deleting¶

Update Documents¶

# Re-add with same ID to update
await vectorstore.aadd_texts(
    ["Updated content"],
    ids=["existing-id"]
)

Delete by IDs¶

ids_to_delete = ["id1", "id2", "id3"]
await vectorstore.adelete(ids=ids_to_delete)

Delete All¶

# Delete all documents in collection
await vectorstore.adelete()

Retriever Interface¶

As Retriever¶

# Convert to LangChain retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)

# Use in chains
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=retriever,
)

answer = await qa.ainvoke("What is CockroachDB?")

MMR (Maximal Marginal Relevance)¶

# Diverse results (less similar to each other)
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 10,
        "fetch_k": 50,  # Fetch more candidates
        "lambda_mult": 0.5  # Balance relevance vs diversity
    }
)

# Or directly
results = await vectorstore.amax_marginal_relevance_search(
    "query",
    k=10,
    fetch_k=50,
    lambda_mult=0.5
)

Similarity Score Threshold¶

# Only return highly relevant results
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "k": 10,
        "score_threshold": 0.8
    }
)

Distance Strategies¶

Choose a distance metric based on your embeddings:

from langchain_cockroachdb import DistanceStrategy

# Cosine similarity (default, normalized vectors)
vectorstore = AsyncCockroachDBVectorStore(
    engine=engine,
    embeddings=embeddings,
    collection_name="docs",
    distance_strategy=DistanceStrategy.COSINE
)

# Euclidean distance (L2)
vectorstore = AsyncCockroachDBVectorStore(
    engine=engine,
    embeddings=embeddings,
    collection_name="docs",
    distance_strategy=DistanceStrategy.EUCLIDEAN
)

# Inner product
vectorstore = AsyncCockroachDBVectorStore(
    engine=engine,
    embeddings=embeddings,
    collection_name="docs",
    distance_strategy=DistanceStrategy.INNER_PRODUCT
)

Which to Use?¶

| Distance Strategy | Best For | Normalized? |
| --- | --- | --- |
| Cosine | Most embeddings (OpenAI, Anthropic) | Yes |
| Euclidean (L2) | Spatial data, distances matter | No |
| Inner Product | Pre-normalized vectors, speed priority | Yes |

Table Management¶

Drop Table¶

await vectorstore.adrop_table()

Check if Table Exists¶

async with engine.engine.connect() as conn:
    from sqlalchemy import text
    result = await conn.execute(
        text("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables 
                WHERE table_name = 'documents'
            )
        """)
    )
    exists = result.scalar()

Performance Tips¶

1. Use Appropriate Batch Sizes¶

# Smaller embeddings
vectorstore = AsyncCockroachDBVectorStore(
    engine=engine,
    embeddings=embeddings,
    collection_name="docs",
    batch_size=500  # < 512 dims
)

# Larger embeddings
vectorstore = AsyncCockroachDBVectorStore(
    engine=engine,
    embeddings=embeddings,
    collection_name="docs",
    batch_size=100  # > 1024 dims
)

2. Create Indexes¶

from langchain_cockroachdb import CSPANNIndex

# Create vector index for faster queries
index = CSPANNIndex()
await vectorstore.aapply_vector_index(index)

See the Vector Indexes Guide for details.

3. Use Connection Pooling¶

engine = CockroachDBEngine.from_connection_string(
    connection_string,
    pool_size=20,      # More connections for concurrency
    max_overflow=40,
)

4. Filter Early¶

# Good: Filter reduces candidates before vector search
results = await vectorstore.asimilarity_search(
    "query",
    k=10,
    filter={"category": "tech"}  # Narrows search space
)

# Less efficient: No filter means searching all vectors
results = await vectorstore.asimilarity_search(
    "query",
    k=10
)
# Then filter results in Python

Common Patterns¶

Multi-Tenant Isolation¶

# Store tenant_id in metadata
await vectorstore.aadd_texts(
    texts,
    metadatas=[{"tenant_id": "tenant-123"}] * len(texts)
)

# Query with tenant filter
results = await vectorstore.asimilarity_search(
    "query",
    k=5,
    filter={"tenant_id": "tenant-123"}
)

Versioning Documents¶

# Include version in metadata
await vectorstore.aadd_texts(
    ["Document content"],
    metadatas=[{"doc_id": "abc123", "version": 2}]
)

# Query latest version
results = await vectorstore.asimilarity_search(
    "query",
    k=5,
    filter={"version": {"$gte": 2}}
)

Source Attribution¶

# Track sources
metadatas = [
    {
        "source": "https://example.com/page1",
        "title": "Page Title",
        "author": "John Doe",
        "date": "2024-01-15"
    }
]

await vectorstore.aadd_texts(texts, metadatas=metadatas)

# Search and cite sources
results = await vectorstore.asimilarity_search_with_score(
    "query", k=3
)

for doc, score in results:
    print(f"Source: {doc.metadata['source']}")
    print(f"Relevance: {score:.2f}")

Multi-Tenancy (Namespaces)¶

Isolate documents by tenant within a single table:

# Create table with namespace column (opt-in)
await engine.ainit_vectorstore_table(
    table_name="documents",
    vector_dimension=1536,
    namespace_column="namespace",
)

# Per-tenant stores
store_a = AsyncCockroachDBVectorStore(
    engine=engine, embeddings=embeddings,
    collection_name="documents", namespace="tenant-a",
)
store_b = AsyncCockroachDBVectorStore(
    engine=engine, embeddings=embeddings,
    collection_name="documents", namespace="tenant-b",
)

# All operations (search, delete, get_by_ids) are scoped to the namespace
await store_a.aadd_texts(["Tenant A doc"])
results = await store_a.asimilarity_search("doc")  # Only tenant A's docs

See the Multi-Tenancy Guide for full details, including C-SPANN prefix-column indexing.

Error Handling¶

import asyncio

try:
    results = await vectorstore.asimilarity_search("query", k=5)
except Exception as e:
    print(f"Search failed: {e}")
    # Retry logic is automatic, but you can catch final failure

Next Steps¶

- Multi-Tenancy - Namespace-based tenant isolation
- Vector Indexes - Optimize query performance
- Hybrid Search - Combine full-text search (FTS) with vectors
- Configuration - Tune for your workload
- LangChain Official: CockroachDB Vector Store