Skip to content

rag.chunkers.text_chunker

Text chunker for the RAG module

BaseChunker

Bases: BaseModel, ABC

Base class for chunkers.

Example:

from mirascope.rag import BaseChunker, Document


class TextChunker(BaseChunker):
    """Example chunker that cuts text into fixed-size, overlapping windows."""

    # Maximum number of characters in each chunk.
    chunk_size: int
    # Number of characters by which the next window starts before the
    # previous window's nominal end.
    chunk_overlap: int

    def chunk(self, text: str) -> list[Document]:
        """Split `text` into Documents of at most `chunk_size` characters."""
        chunks: list[Document] = []
        start: int = 0
        while start < len(text):
            # Clamp the window so the final chunk stops at the end of the text.
            end: int = min(start + self.chunk_size, len(text))
            # Every chunk gets a freshly generated UUID4 string as its id.
            chunks.append(Document(text=text[start:end], id=str(uuid.uuid4())))
            # Advance by the stride. NOTE: `start` only moves forward when
            # chunk_overlap < chunk_size; otherwise this loop never terminates.
            start += self.chunk_size - self.chunk_overlap
        return chunks
Source code in mirascope/rag/chunkers/base_chunker.py
class BaseChunker(BaseModel, ABC):
    """Base class for chunkers.

    Subclasses must implement `chunk`, which turns a text into a list of
    `Document` objects.

    Example:

    ```python
    import uuid

    from mirascope.rag import BaseChunker, Document


    class TextChunker(BaseChunker):
        chunk_size: int
        chunk_overlap: int

        def chunk(self, text: str) -> list[Document]:
            # NOTE: requires chunk_overlap < chunk_size, otherwise `start`
            # never advances and the loop does not terminate.
            chunks: list[Document] = []
            start: int = 0
            while start < len(text):
                end: int = min(start + self.chunk_size, len(text))
                chunks.append(Document(text=text[start:end], id=str(uuid.uuid4())))
                start += self.chunk_size - self.chunk_overlap
            return chunks
    ```
    """

    @abstractmethod
    def chunk(self, text: str) -> list[Document]:
        """Returns the list of Documents produced from `text`.

        Each Document contains an id, text, and optionally metadata.

        Args:
            text: The text to split into documents.

        Returns:
            The documents created from `text`.
        """
        ...  # pragma: no cover

chunk(text) abstractmethod

Returns a list of Documents, each containing an id, text, and optionally metadata.

Source code in mirascope/rag/chunkers/base_chunker.py
@abstractmethod
def chunk(self, text: str) -> list[Document]:
    """Returns the list of Documents produced from `text`.

    Each Document contains an id, text, and optionally metadata.

    Args:
        text: The text to split into documents.

    Returns:
        The documents created from `text`.
    """
    ...  # pragma: no cover

Document

Bases: BaseModel

A document to be added to the vectorstore.

Source code in mirascope/rag/types.py
class Document(BaseModel):
    """A document to be added to the vectorstore.

    Holds the document's text together with an identifier and optional
    metadata.
    """

    # Identifier of the document.
    id: str
    # The document's text content.
    text: str
    # Optional metadata keyed by string; defaults to None when not provided.
    metadata: Optional[dict[str, Any]] = None

TextChunker

Bases: BaseChunker

A text chunker that splits a text into chunks of a fixed size, with a configurable overlap between consecutive chunks.

Example:

from mirascope.rag import TextChunker

text_chunker = TextChunker(chunk_size=1000, chunk_overlap=200)
chunks = text_chunker.chunk("This is a long text that I want to split into chunks.")
print(chunks)
Source code in mirascope/rag/chunkers/text_chunker.py
class TextChunker(BaseChunker):
    """A text chunker that splits a text into fixed-size, overlapping chunks.

    Consecutive chunks start `chunk_size - chunk_overlap` characters apart, so
    each chunk begins `chunk_overlap` characters before the nominal end of the
    previous one.

    Example:

    ```python
    from mirascope.rag import TextChunker

    text_chunker = TextChunker(chunk_size=1000, chunk_overlap=200)
    chunks = text_chunker.chunk("This is a long text that I want to split into chunks.")
    print(chunks)
    ```
    """

    # Maximum number of characters in each chunk; must be positive.
    chunk_size: int
    # Number of characters shared between consecutive chunks; must be smaller
    # than `chunk_size` so that the window always moves forward.
    chunk_overlap: int

    def chunk(self, text: str) -> list[Document]:
        """Splits `text` into Documents of at most `chunk_size` characters.

        Args:
            text: The text to split.

        Returns:
            One Document per window, in order of appearance, each with a
            freshly generated UUID4 string as its id. An empty `text` yields
            an empty list.

        Raises:
            ValueError: If `chunk_size` is not positive, or if `chunk_overlap`
                is greater than or equal to `chunk_size`.
        """
        if self.chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be positive, got {self.chunk_size}"
            )
        # A stride of zero or less would never advance through the text; the
        # previous while-loop spun forever (appending chunks) in that case.
        stride = self.chunk_size - self.chunk_overlap
        if stride <= 0:
            raise ValueError(
                "chunk_overlap must be smaller than chunk_size, got "
                f"chunk_overlap={self.chunk_overlap} and "
                f"chunk_size={self.chunk_size}"
            )
        # Slicing clamps at the end of the string, so the final chunk is
        # simply shorter when fewer than `chunk_size` characters remain.
        return [
            Document(
                text=text[start : start + self.chunk_size],
                id=str(uuid.uuid4()),
            )
            for start in range(0, len(text), stride)
        ]