chore: initial public snapshot for github upload

2026-03-26 20:06:14 +08:00
commit 0e5ecd930e
3497 changed files with 1586236 additions and 0 deletions
--- a/llm-gateway-competitors/litellm-wheel-src/litellm/rag/text_splitters/__init__.py
+++ b/llm-gateway-competitors/litellm-wheel-src/litellm/rag/text_splitters/__init__.py
@@ -0,0 +1,9 @@
+"""
+Text splitting utilities for RAG ingestion.
+"""
+
+from litellm.rag.text_splitters.recursive_character_text_splitter import (
+    RecursiveCharacterTextSplitter,
+)
+
+__all__ = ["RecursiveCharacterTextSplitter"]  # public API re-exported by this package
--- a/llm-gateway-competitors/litellm-wheel-src/litellm/rag/text_splitters/recursive_character_text_splitter.py
+++ b/llm-gateway-competitors/litellm-wheel-src/litellm/rag/text_splitters/recursive_character_text_splitter.py
@@ -0,0 +1,196 @@
+"""
+RecursiveCharacterTextSplitter for RAG ingestion.
+
+A simple implementation that splits text recursively by different separators.
+"""
+
+from collections import deque
+from typing import Deque, List, Optional
+
+from litellm.constants import (
+    DEFAULT_CHUNK_OVERLAP,
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_MAX_RECURSE_DEPTH,
+)
+
+
+class RecursiveCharacterTextSplitter:
+    """
+    Split text recursively by different separators.
+
+    Tries the separators in order and splits on the first one found in the
+    text. Pieces that are still too large are split again with the remaining
+    (finer) separators; pieces that fit are merged back into chunks of at
+    most chunk_size characters that share up to chunk_overlap characters
+    with the previous chunk.
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = DEFAULT_CHUNK_SIZE,
+        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+        separators: Optional[List[str]] = None,
+    ):
+        """
+        Configure the splitter.
+
+        Args:
+            chunk_size: Maximum chunk length in characters. Must be positive.
+            chunk_overlap: Characters of trailing context repeated at the
+                start of the next chunk. Must not be negative.
+            separators: Separators to try, coarsest first; "" splits into
+                single characters. None or an empty list selects the
+                defaults: paragraphs, lines, words, then characters.
+
+        Raises:
+            ValueError: If chunk_size is not positive or chunk_overlap is
+                negative.
+        """
+        if chunk_size <= 0:
+            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
+        if chunk_overlap < 0:
+            raise ValueError(
+                f"chunk_overlap must not be negative, got {chunk_overlap}"
+            )
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.separators = separators or ["\n\n", "\n", " ", ""]
+
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks of at most chunk_size characters."""
+        return self._split_text(text, self.separators)
+
+    def _split_text(
+        self, text: str, separators: List[str], depth: int = 0
+    ) -> List[str]:
+        """
+        Recursively split text using separators.
+
+        Args:
+            text: Text to split.
+            separators: Candidate separators, coarsest first.
+            depth: Current recursion depth; bounds the recursion.
+
+        Returns:
+            Non-empty chunks in document order, stripped of surrounding
+            whitespace.
+        """
+        if depth > DEFAULT_MAX_RECURSE_DEPTH:
+            # Stop recursing and cut by size, exactly as for text that has run
+            # out of separators, so overlap and stripping stay consistent.
+            return self._force_split(text)
+
+        # Use the first separator present in the text. "" always matches and
+        # leaves nothing finer to fall back on.
+        separator = separators[-1]
+        new_separators: List[str] = []
+        for i, sep in enumerate(separators):
+            if sep == "":
+                separator = sep
+                break
+            if sep in text:
+                separator = sep
+                new_separators = separators[i + 1 :]
+                break
+
+        # str.split rejects an empty separator, so split per character instead.
+        splits = text.split(separator) if separator else list(text)
+
+        final_chunks: List[str] = []
+        good_splits: List[str] = []
+        for split in splits:
+            if len(split) < self.chunk_size:
+                good_splits.append(split)
+                continue
+
+            # An oversized piece ends the current run: flush it first so the
+            # chunks stay in document order.
+            if good_splits:
+                final_chunks.extend(self._merge_splits(good_splits, separator))
+                good_splits = []
+
+            if new_separators:
+                final_chunks.extend(
+                    self._split_text(split, new_separators, depth + 1)
+                )
+            else:
+                # No finer separator left: cut by size.
+                final_chunks.extend(self._force_split(split))
+
+        if good_splits:
+            final_chunks.extend(self._merge_splits(good_splits, separator))
+
+        return final_chunks
+
+    def _merge_splits(self, splits: List[str], separator: str) -> List[str]:
+        """
+        Merge splits into chunks respecting chunk_size and chunk_overlap.
+
+        Args:
+            splits: Consecutive pieces of text, each expected to be shorter
+                than chunk_size.
+            separator: String used to re-join the pieces.
+
+        Returns:
+            Non-empty chunks, stripped of surrounding whitespace.
+        """
+        sep_len = len(separator)
+        chunks: List[str] = []
+        window: Deque[str] = deque()  # pieces of the chunk being built
+        window_len = 0  # always len(separator.join(window))
+
+        for split in splits:
+            split_len = len(split)
+            joined_len = window_len + split_len + (sep_len if window else 0)
+
+            if window and joined_len > self.chunk_size:
+                chunk_text = separator.join(window).strip()
+                if chunk_text:
+                    chunks.append(chunk_text)
+
+                # Keep a tail of the emitted chunk as overlap: drop pieces
+                # from the front until the tail is within chunk_overlap and
+                # leaves room for the incoming split. This may empty the
+                # window, so a piece is never repeated when it cannot fit.
+                while window and (
+                    window_len > self.chunk_overlap
+                    or window_len + sep_len + split_len > self.chunk_size
+                ):
+                    removed = window.popleft()
+                    window_len -= len(removed) + (sep_len if window else 0)
+
+            if window:
+                window_len += sep_len
+            window.append(split)
+            window_len += split_len
+
+        if window:
+            chunk_text = separator.join(window).strip()
+            if chunk_text:
+                chunks.append(chunk_text)
+
+        return chunks
+
+    def _force_split(self, text: str) -> List[str]:
+        """
+        Force split text by chunk_size when no separator works.
+
+        Consecutive slices normally overlap by chunk_overlap characters.
+        Returns the non-empty slices, stripped of surrounding whitespace.
+        """
+        # Always advance by at least one character; otherwise an overlap that
+        # is not smaller than chunk_size would loop forever.
+        step = max(1, self.chunk_size - self.chunk_overlap)
+        chunks: List[str] = []
+        start = 0
+
+        while start < len(text):
+            end = start + self.chunk_size
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            if end >= len(text):
+                break
+            start += step
+
+        return chunks