
Commit 8fd8ce7

feat: Add limit parameter to operations

Adds a limit parameter to Extract, Map, Filter, and Reduce operations to control the number of processed items.

Co-authored-by: ss.shankar505 <[email protected]>

1 parent 81d1104

11 files changed: +211 −86 lines

docetl/operations/extract.py
Lines changed: 5 additions & 0 deletions

@@ -26,6 +26,7 @@ class schema(BaseOperation.schema):
         timeout: int | None = None
         skip_on_error: bool = False
         litellm_completion_kwargs: dict[str, Any] = Field(default_factory=dict)
+        limit: int | None = Field(None, gt=0)

         @field_validator("prompt")
         def validate_prompt(cls, v):
@@ -392,6 +393,10 @@ def execute(self, input_data: list[dict]) -> tuple[list[dict], float]:
         Returns:
             tuple[list[dict], float]: A tuple containing the processed data and the total cost of the operation.
         """
+        limit_value = self.config.get("limit")
+        if limit_value is not None:
+            input_data = input_data[:limit_value]
+
         if not input_data:
             return [], 0.0
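The new field uses `Field(None, gt=0)`, so an omitted `limit` means "process everything" while zero or negative values fail config validation. A minimal pydantic sketch of that behavior (`OpSchema` here is a stand-in name, not the real class):

```python
from pydantic import BaseModel, Field, ValidationError

class OpSchema(BaseModel):
    # Mirrors the new field: omitted means "no limit"; zero/negative rejected.
    limit: int | None = Field(None, gt=0)

print(OpSchema().limit)         # None -> process everything
print(OpSchema(limit=5).limit)  # 5
try:
    OpSchema(limit=0)
except ValidationError as err:
    print("rejected:", err.errors()[0]["msg"])
```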

docetl/operations/filter.py
Lines changed: 30 additions & 51 deletions

@@ -33,6 +33,30 @@ def validate_filter_output_schema(self):

         return self

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._filter_key = next(
+            iter(
+                [
+                    k
+                    for k in self.config["output"]["schema"].keys()
+                    if k != "_short_explanation"
+                ]
+            )
+        )
+        self._filter_is_build = False
+
+    def _limit_applies_to_inputs(self) -> bool:
+        return False
+
+    def _handle_result(self, result: dict[str, Any]) -> tuple[dict | None, bool]:
+        keep_record = bool(result.get(self._filter_key))
+        result.pop(self._filter_key, None)
+
+        if self._filter_is_build or keep_record:
+            return result, keep_record
+        return None, False
+
     def execute(
         self, input_data: list[dict], is_build: bool = False
     ) -> tuple[list[dict], float]:
@@ -46,55 +70,10 @@ def execute(
         Returns:
             tuple[list[dict], float]: A tuple containing the filtered list of dictionaries
             and the total cost of the operation.
-
-        This method performs the following steps:
-        1. Processes each input item using an LLM model
-        2. Validates the output
-        3. Filters the results based on the specified filter key
-        4. Calculates the total cost of the operation
-
-        The method uses multi-threading to process items in parallel, improving performance
-        for large datasets.
-
-        Usage:
-        ```python
-        from docetl.operations import FilterOperation
-
-        config = {
-            "prompt": "Determine if the following item is important: {{input}}",
-            "output": {
-                "schema": {"is_important": "bool"}
-            },
-            "model": "gpt-3.5-turbo"
-        }
-        filter_op = FilterOperation(config)
-        input_data = [
-            {"id": 1, "text": "Critical update"},
-            {"id": 2, "text": "Regular maintenance"}
-        ]
-        results, cost = filter_op.execute(input_data)
-        print(f"Filtered results: {results}")
-        print(f"Total cost: {cost}")
-        ```
         """
-        filter_key = next(
-            iter(
-                [
-                    k
-                    for k in self.config["output"]["schema"].keys()
-                    if k != "_short_explanation"
-                ]
-            )
-        )
-
-        results, total_cost = super().execute(input_data)
-
-        # Drop records with filter_key values that are False
-        if not is_build:
-            results = [result for result in results if result[filter_key]]
-
-        # Drop the filter_key from the results
-        for result in results:
-            result.pop(filter_key, None)
-
-        return results, total_cost
+        previous_state = self._filter_is_build
+        self._filter_is_build = is_build
+        try:
+            return super().execute(input_data)
+        finally:
+            self._filter_is_build = previous_state
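The refactor moves filter-specific logic into two hooks (`_limit_applies_to_inputs`, `_handle_result`) that the shared map execution loop consumes. A minimal sketch of the pattern, with `BaseOp` and `FilterOp` as hypothetical stand-ins for the real docetl classes:

```python
# Hypothetical stand-ins for docetl's shared map loop; illustrative only.
class BaseOp:
    def run_llm(self, item):
        return dict(item)  # placeholder for the real LLM call

    def execute(self, items, limit=None):
        kept, counted = [], 0
        for item in items:
            result, counts_towards_limit = self._handle_result(self.run_llm(item))
            if result is not None:
                kept.append(result)
            if limit is not None and counts_towards_limit:
                counted += 1
                if counted >= limit:  # stop scheduling further work
                    break
        return kept

class FilterOp(BaseOp):
    def _handle_result(self, result):
        keep = bool(result.pop("keep", False))
        return (result, True) if keep else (None, False)

# Only records with keep=True count toward the limit:
op = FilterOp()
data = [{"id": 1, "keep": False}, {"id": 2, "keep": True}, {"id": 3, "keep": True}]
print(op.execute(data, limit=1))  # [{'id': 2}]
```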

docetl/operations/map.py
Lines changed: 74 additions & 35 deletions

@@ -44,6 +44,7 @@ class schema(BaseOperation.schema):
         litellm_completion_kwargs: dict[str, Any] = {}
         pdf_url_key: str | None = None
         flush_partial_result: bool = False
+        limit: int | None = Field(None, gt=0)
         # Calibration parameters
         calibrate: bool = False
         num_calibration_docs: int = Field(10, gt=0)
@@ -152,6 +153,12 @@ def __init__(
             # Mark that we need to append document statement
             self.config["_append_document_to_batch_prompt"] = True

+    def _limit_applies_to_inputs(self) -> bool:
+        return True
+
+    def _handle_result(self, result: dict[str, Any]) -> tuple[dict | None, bool]:
+        return result, True
+
     def _generate_calibration_context(self, input_data: list[dict]) -> str:
         """
         Generate calibration context by running the operation on a sample of documents
@@ -272,17 +279,27 @@ def execute(self, input_data: list[dict]) -> tuple[list[dict], float]:

         The method uses parallel processing to improve performance.
         """
+        limit_value = self.config.get("limit")
+
         # Check if there's no prompt and only drop_keys
         if "prompt" not in self.config and "drop_keys" in self.config:
+            data_to_process = input_data
+            if limit_value is not None and self._limit_applies_to_inputs():
+                data_to_process = input_data[:limit_value]
             # If only drop_keys is specified, simply drop the keys and return
             dropped_results = []
-            for item in input_data:
+            for item in data_to_process:
                 new_item = {
                     k: v for k, v in item.items() if k not in self.config["drop_keys"]
                 }
                 dropped_results.append(new_item)
+                if limit_value is not None and len(dropped_results) >= limit_value:
+                    break
             return dropped_results, 0.0  # Return the modified data with no cost

+        if limit_value is not None and self._limit_applies_to_inputs():
+            input_data = input_data[:limit_value]
+
         # Generate calibration context if enabled
         calibration_context = ""
         if self.config.get("calibrate", False) and "prompt" in self.config:
@@ -512,40 +529,62 @@ def _process_map_batch(items: list[dict]) -> tuple[list[dict], float]:

             return all_results, total_cost

-        with ThreadPoolExecutor(max_workers=self.max_batch_size) as executor:
-            batch_size = self.max_batch_size if self.max_batch_size is not None else 1
-            futures = []
-            for i in range(0, len(input_data), batch_size):
-                batch = input_data[i : i + batch_size]
-                futures.append(executor.submit(_process_map_batch, batch))
-            results = []
-            total_cost = 0
-            pbar = RichLoopBar(
-                range(len(futures)),
-                desc=f"Processing {self.config['name']} (map) on all documents",
-                console=self.console,
-            )
-            for batch_index in pbar:
-                result_list, item_cost = futures[batch_index].result()
-                if result_list:
-                    if "drop_keys" in self.config:
-                        result_list = [
-                            {
-                                k: v
-                                for k, v in result.items()
-                                if k not in self.config["drop_keys"]
-                            }
-                            for result in result_list
-                        ]
-                    results.extend(result_list)
-                    # --- BEGIN: Flush partial checkpoint ---
-                    if self.config.get("flush_partial_results", False):
-                        op_name = self.config["name"]
-                        self.runner._flush_partial_results(
-                            op_name, batch_index, result_list
-                        )
-                    # --- END: Flush partial checkpoint ---
-                total_cost += item_cost
+        limit_counter = 0
+
+        batch_size = self.max_batch_size if self.max_batch_size is not None else 1
+        total_batches = (
+            (len(input_data) + batch_size - 1) // batch_size if input_data else 0
+        )
+        results: list[dict] = []
+        total_cost = 0.0
+        limit_reached = False
+
+        pbar = RichLoopBar(
+            range(total_batches),
+            desc=f"Processing {self.config['name']} (map) on all documents",
+            console=self.console,
+        )
+
+        for batch_index in pbar:
+            if limit_value is not None and limit_counter >= limit_value:
+                break
+
+            batch_start = batch_index * batch_size
+            batch = input_data[batch_start : batch_start + batch_size]
+            if not batch:
+                break
+
+            result_list, item_cost = _process_map_batch(batch)
+            total_cost += item_cost
+
+            if result_list:
+                if "drop_keys" in self.config:
+                    result_list = [
+                        {
+                            k: v
+                            for k, v in result.items()
+                            if k not in self.config["drop_keys"]
+                        }
+                        for result in result_list
+                    ]
+
+                if self.config.get("flush_partial_results", False):
+                    op_name = self.config["name"]
+                    self.runner._flush_partial_results(op_name, batch_index, result_list)
+
+                for result in result_list:
+                    processed_result, counts_towards_limit = self._handle_result(result)
+                    if processed_result is not None:
+                        results.append(processed_result)
+
+                    if limit_value is not None and counts_towards_limit:
+                        limit_counter += 1
+                        if limit_counter >= limit_value:
+                            limit_reached = True
+                            break
+
+            if limit_reached:
+                break

         if self.status:
             self.status.start()
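One detail in the rewritten loop: the progress bar is sized with integer ceiling division rather than `len(futures)`. A quick sketch of that batch math:

```python
def total_batches(n_items: int, batch_size: int) -> int:
    # Integer ceiling division: ceil(n_items / batch_size) without floats.
    return (n_items + batch_size - 1) // batch_size if n_items else 0

data = list(range(7))
bs = 3
for i in range(total_batches(len(data), bs)):
    print(data[i * bs : (i + 1) * bs])  # [0, 1, 2] / [3, 4, 5] / [6]
```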

docetl/operations/reduce.py
Lines changed: 8 additions & 0 deletions

@@ -64,6 +64,7 @@ class schema(BaseOperation.schema):
         timeout: int | None = None
         litellm_completion_kwargs: dict[str, Any] = Field(default_factory=dict)
         enable_observability: bool = False
+        limit: int | None = Field(None, gt=0)

         @field_validator("prompt")
         def validate_prompt(cls, v):
@@ -282,6 +283,10 @@ def get_group_key(item):
         # Convert the grouped data to a list of tuples
         grouped_data = list(grouped_data.items())

+        limit_value = self.config.get("limit")
+        if limit_value is not None:
+            grouped_data = grouped_data[:limit_value]
+
         def process_group(
             key: tuple, group_elems: list[dict]
         ) -> tuple[dict | None, float]:
@@ -388,6 +393,9 @@ def process_group(
             if output is not None:
                 results.append(output)

+        if limit_value is not None and len(results) > limit_value:
+            results = results[:limit_value]
+
         if self.config.get("persist_intermediates", False):
             for result in results:
                 key = tuple(result[k] for k in self.config["reduce_key"])

docs/operators/extract.md
Lines changed: 3 additions & 0 deletions

@@ -140,6 +140,9 @@ This strategy asks the LLM to generate regex patterns matching the desired content
 | `timeout` | Timeout for LLM calls in seconds | 120 |
 | `skip_on_error` | Continue processing if errors occur | false |
 | `litellm_completion_kwargs` | Additional parameters for LiteLLM calls | {} |
+| `limit` | Maximum number of documents to extract from before stopping | Processes all data |
+
+When `limit` is set, Extract only reformats and submits the first _N_ documents. This is handy when the upstream dataset is large and you want to cap cost while previewing results.

 ## Best Practices
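A capped extract config might look like the sketch below; aside from `limit`, the keys shown are illustrative placeholders rather than a verified schema:

```python
# Illustrative only: every key except "limit" is an assumption.
extract_config = {
    "name": "extract_key_findings",
    "type": "extract",
    "prompt": "Extract the key findings from the document.",
    "limit": 50,  # reformat and submit only the first 50 documents
}
```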

docs/operators/filter.md
Lines changed: 4 additions & 0 deletions

@@ -83,6 +83,10 @@ This example demonstrates how the Filter operation distinguishes between high-impact

 See [map optional parameters](./map.md#optional-parameters) for additional configuration options, including `batch_prompt` and `max_batch_size`.

+### Limiting filtered outputs
+
+`limit` behaves slightly differently for filter operations than for map operations. Because filter drops documents whose predicate evaluates to `false`, the limit counts only the documents that would be retained (i.e., the ones whose boolean output is `true`). DocETL will continue evaluating additional inputs until it has collected `limit` passing documents and then stop scheduling further LLM calls. This ensures you can request “the first N matches” without paying to score the entire dataset.
+
 !!! info "Validation"

     For more details on validation techniques and implementation, see [operators](../concepts/operators.md#validation).
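A pure-Python mock of the "first N matches" rule described above, with a plain function standing in for the LLM predicate:

```python
# Only records whose predicate is True count toward the limit, but extra
# inputs are still evaluated until enough matches are collected.
def filter_with_limit(items, predicate, limit):
    kept = []
    for item in items:
        if predicate(item):         # stand-in for the LLM boolean output
            kept.append(item)
            if len(kept) >= limit:  # stop scheduling further evaluations
                break
    return kept

docs = [{"id": i, "urgent": i % 3 == 0} for i in range(10)]
print(filter_with_limit(docs, lambda d: d["urgent"], limit=2))
# [{'id': 0, 'urgent': True}, {'id': 3, 'urgent': True}]
```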

docs/operators/map.md
Lines changed: 5 additions & 0 deletions

@@ -140,6 +140,7 @@ This example demonstrates how the Map operation can transform long, unstructured
 | `optimize` | Flag to enable operation optimization | `True` |
 | `recursively_optimize` | Flag to enable recursive optimization of operators synthesized as part of rewrite rules | `false` |
 | `sample` | Number of samples to use for the operation | Processes all data |
+| `limit` | Maximum number of outputs to produce before stopping | Processes all data |
 | `tools` | List of tool definitions for LLM use | None |
 | `validate` | List of Python expressions to validate the output | None |
 | `flush_partial_results` | Write results of individual batches of map operation to disk for faster inspection | False |
@@ -158,6 +159,10 @@ This example demonstrates how the Map operation can transform long, unstructured

 Note: If `drop_keys` is specified, `prompt` and `output` become optional parameters.

+### Limiting execution
+
+Set `limit` when you only need the first _N_ map results or want to cap LLM spend. The operation slices the processed dataset to the first `limit` entries and also stops scheduling new prompts once that many outputs have been produced, even if a prompt returns multiple records. Filter operations inherit this behavior but redefine the count so the limit only applies to records whose filter predicate evaluates to `true` (see [Filter](./filter.md#optional-parameters)).
+

 !!! info "Validation and Gleaning"
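A small sketch of the output-based accounting described above (plain Python, independent of the docetl API):

```python
# Outputs, not inputs, are compared against the limit: a prompt that
# returns multiple records can hit the cap mid-batch.
def collect_with_limit(batches, limit):
    results = []
    for batch_outputs in batches:      # each entry: records from one prompt
        for record in batch_outputs:
            results.append(record)
            if len(results) >= limit:  # stop scheduling new prompts
                return results
    return results

print(collect_with_limit([["s1", "s2"], ["s3"], ["s4"]], limit=3))
# ['s1', 's2', 's3'] -- the last prompt is never issued
```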

docs/operators/reduce.md
Lines changed: 5 additions & 0 deletions

@@ -52,6 +52,7 @@ This Reduce operation processes customer feedback grouped by department:
 | Parameter | Description | Default |
 | --------- | ----------- | ------- |
 | `sample` | Number of samples to use for the operation | None |
+| `limit` | Maximum number of groups to process before stopping | All groups |
 | `synthesize_resolve` | If false, won't synthesize a resolve operation between map and reduce | true |
 | `model` | The language model to use | Falls back to default_model |
 | `input` | Specifies the schema or keys to subselect from each item | All keys from input items |
@@ -67,6 +68,10 @@ This Reduce operation processes customer feedback grouped by department:
 | `litellm_completion_kwargs` | Additional parameters to pass to LiteLLM completion calls. | {} |
 | `bypass_cache` | If true, bypass the cache for this operation. | False |

+### Limiting group processing
+
+Set `limit` to short-circuit the reduce phase after the first _N_ groups have been aggregated. This is useful for previewing results or capping LLM usage when you only need the earliest groups (according to the original input order). Groups beyond the limit are never scheduled, so you avoid extra fold/merge calls. If a grouped reduce returns more than one record per group, the final output list is truncated to `limit`.
+
 ## Advanced Features

 ### Incremental Folding
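For reference, the "Limiting group processing" note above boils down to this plain-Python sketch, with `itertools.groupby` standing in for DocETL's grouping step:

```python
# Groups beyond the first N are never scheduled for fold/merge calls.
from itertools import groupby

rows = sorted(
    [{"dept": "a", "score": 1}, {"dept": "b", "score": 3},
     {"dept": "a", "score": 2}, {"dept": "c", "score": 4}],
    key=lambda r: r["dept"],
)
grouped = [(k, list(g)) for k, g in groupby(rows, key=lambda r: r["dept"])]

limit = 2
for key, group in grouped[:limit]:  # only the first two groups are aggregated
    print(key, sum(r["score"] for r in group))
# a 3
# b 3
```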
