Merged
43 changes: 38 additions & 5 deletions README.md
@@ -1,11 +1,9 @@
# diffchunk

[![CI](https://github.com/peteretelej/diffchunk/actions/workflows/ci.yml/badge.svg)](https://github.com/peteretelej/diffchunk/actions/workflows/ci.yml)
[![codecov](https://codecov.io/gh/peteretelej/diffchunk/branch/main/graph/badge.svg)](https://codecov.io/gh/peteretelej/diffchunk)
[![PyPI version](https://img.shields.io/pypi/v/diffchunk.svg)](https://pypi.org/project/diffchunk/)
[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
[![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)

MCP server that enables LLMs to navigate large diff files efficiently. Instead of reading entire diffs sequentially, LLMs can jump directly to relevant changes using pattern-based navigation.
@@ -105,8 +103,9 @@ This lets your AI assistant handle massive diffs that would normally crash other

```python
list_chunks("/tmp/changes.diff")
# → 5 chunks across 12 files, 3,847 total lines
# Each chunk includes file_details with per-file line counts
# -> 5 chunks across 12 files, 3,847 total lines, ~15,420 tokens
# Each chunk includes token_count and file_details with per-file line counts
# Response includes total_token_count for context-budget planning
```

**Target specific files:**
@@ -164,10 +163,44 @@ load_diff(
"/tmp/large.diff",
max_chunk_lines=2000,
include_patterns="*.py,*.js",
exclude_patterns="*test*"
exclude_patterns="*test*",
context_lines=2
)
```

### Format Options

Use the `format` parameter on `get_chunk` to transform output for LLM consumption:

```python
# Default - raw diff output
get_chunk("/tmp/changes.diff", 1, format="raw")

# Annotated - structured with line numbers, file headers, hunk separation
get_chunk("/tmp/changes.diff", 1, format="annotated")

# Compact - token-efficient, only new hunks (context + added lines)
get_chunk("/tmp/changes.diff", 1, format="compact")
```

**Annotated format** adds `## File:` headers, `__new hunk__`/`__old hunk__` sections with new-file line numbers, and function context from `@@` headers.

**Compact format** shows only what was added or kept, omitting removed lines and `__old hunk__` sections entirely. Useful when you only need to see the final state.
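
For illustration, annotated output for a small change might look like the sketch below. The layout is hypothetical — exact line numbering and spacing are determined by the formatter — but it uses the elements described above (`## File:` header, `__new hunk__`/`__old hunk__` sections, function context):

```
## File: 'src/main.py'

@@ ... @@ def main():
__new hunk__
10  def main():
11 +    args = parse_args()
12      run(args)
__old hunk__
 def main():
     run(args)
```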

### Context Reduction

Use `context_lines` on `load_diff` to reduce context lines per hunk at load time:

```python
# Keep only 2 lines of context around each change
load_diff("/tmp/large.diff", context_lines=2)

# Keep only changes, no context
load_diff("/tmp/large.diff", context_lines=0)
```

This composes with `format` - context is reduced at load time, then formatting is applied at display time.

## Supported Formats

- Git diff output (`git diff`, `git show`)
113 changes: 94 additions & 19 deletions docs/design.md
@@ -24,32 +24,37 @@ def load_diff(
skip_generated: bool = True,
include_patterns: Optional[str] = None,
exclude_patterns: Optional[str] = None,
context_lines: Optional[int] = None,
) -> Dict[str, Any]
```

**Returns:** `{"chunks": int, "files": int, "total_lines": int, "file_path": str}`
**Returns:** `{"chunks": int, "files": int, "total_lines": int, "file_path": str, "files_excluded": int}`

### list_chunks (Auto-loading)

```python
def list_chunks(absolute_file_path: str) -> List[Dict[str, Any]]
def list_chunks(absolute_file_path: str) -> Dict[str, Any]
```

**Returns:** Array of chunk metadata with files, line counts, summaries, and `file_details` (per-file line counts)
**Returns:** Dictionary with `chunks` (array of chunk metadata with files, line counts, token counts, summaries, and `file_details`) and `total_token_count` (sum of all chunk token counts)

```json
[
{
"chunk": 1,
"files": ["src/main.py", "src/utils.py"],
"file_details": [
{"path": "src/main.py", "lines": 120},
{"path": "src/utils.py", "lines": 45}
],
"lines": 165,
"summary": "2 files, 165 lines"
}
]
{
"chunks": [
{
"chunk": 1,
"files": ["src/main.py", "src/utils.py"],
"file_details": [
{"path": "src/main.py", "lines": 120},
{"path": "src/utils.py", "lines": 45}
],
"lines": 165,
"token_count": 412,
"summary": "2 files, 165 lines"
}
],
"total_token_count": 412
}
```

### get_chunk (Auto-loading)
@@ -58,7 +63,8 @@ def list_chunks(absolute_file_path: str) -> List[Dict[str, Any]]
def get_chunk(
absolute_file_path: str,
chunk_number: int,
include_context: bool = True
include_context: bool = True,
format: str = "raw",
) -> str
```

@@ -111,6 +117,7 @@ class ChunkInfo:
files: List[str]
line_count: int
summary: str
token_count: int = 0 # Estimated token count (len(content) // 4)
parent_file: str | None = None
sub_chunk_index: int | None = None
file_details: List[Dict[str, Any]] = field(default_factory=list) # [{"path": str, "lines": int}]
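
The comment above gives the estimate as `len(content) // 4`. As a standalone sketch of that heuristic (`estimate_tokens` is an illustrative name, not part of diffchunk's API):

```python
def estimate_tokens(content: str) -> int:
    """Rough token estimate: assumes ~4 characters per token,
    a common rule of thumb for English-like text and code."""
    return len(content) // 4

# A 400-character chunk is estimated at about 100 tokens
assert estimate_tokens("x" * 400) == 100
```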
@@ -168,9 +175,10 @@ src/
├── main.py # CLI entry point
├── server.py # MCP server (FastMCP module-level tools)
├── tools.py # MCP tools (DiffChunkTools)
├── models.py # Data models
├── parser.py # Diff parsing (DiffParser)
└── chunker.py # Chunking logic (DiffChunker)
├── models.py # Data models (DiffStats, FormatMode, etc.)
├── parser.py # Diff parsing (DiffParser) and context reduction
├── chunker.py # Chunking logic (DiffChunker)
└── formatter.py # Output formatting (annotated, compact)
```

## Resources
@@ -182,6 +190,73 @@ src/
- Pattern matching (glob) is case-insensitive, matching macOS/Windows filesystem behavior
- Both `find_chunks_for_files` and `get_file_diff` use case-insensitive comparison

## Format Options

### FormatMode Enum

```python
class FormatMode(str, Enum):
RAW = "raw" # Default - unmodified diff output
ANNOTATED = "annotated" # Structured with line numbers and hunk separation
COMPACT = "compact" # Token-efficient, new hunks only
```

`FormatMode` inherits from `str, Enum` so values compare directly with strings.
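
A quick sketch of that property, reproducing the enum locally:

```python
from enum import Enum

class FormatMode(str, Enum):
    RAW = "raw"
    ANNOTATED = "annotated"
    COMPACT = "compact"

# str inheritance makes members compare equal to plain strings
assert FormatMode.RAW == "raw"
assert FormatMode("compact") is FormatMode.COMPACT

# Unknown values raise ValueError from the enum constructor
try:
    FormatMode("bogus")
except ValueError:
    print("rejected")
```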

### `format` Parameter on `get_chunk`

The `format` parameter on `get_chunk` applies at display time only: it transforms the output for rendering, while the stored chunk data always remains raw.

```python
def get_chunk(
absolute_file_path: str,
chunk_number: int,
include_context: bool = True,
format: str = "raw",
) -> str
```

- `"raw"` (default) - returns the original diff content, identical to pre-feature behavior
- `"annotated"` - structured output with `## File:` headers, `__new hunk__`/`__old hunk__` separation, new-file line numbers, and function context from `@@` headers
- `"compact"` - token-efficient output showing only new hunks (context + added lines), omitting removed lines and `__old hunk__` sections

Invalid format values raise `ValueError` listing valid options.

### `context_lines` Parameter on `load_diff`

A load-time parameter that reduces context lines per hunk before chunking. Implemented via `DiffParser.reduce_context()`.

```python
def load_diff(
absolute_file_path: str,
...,
context_lines: Optional[int] = None,
) -> Dict[str, Any]
```

- `None` (default) - keeps all context lines from the original diff
- `0` - keeps only added/removed lines, no context
- `N` - keeps up to N context lines before and after each change

Overlapping context windows between nearby changes preserve shared context lines. Hunk headers are recalculated after reduction. Negative values raise `ValueError`.
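
The reduction idea can be sketched on a simplified hunk body. This is a hypothetical helper, not the real `reduce_context`; in particular, the `@@` header recalculation described above is omitted:

```python
def reduce_context(hunk_lines, n):
    """Keep each +/- change plus up to n context lines on either side.
    Overlapping windows merge naturally via the shared index set."""
    if n < 0:
        raise ValueError("context_lines must be non-negative")
    changes = [i for i, line in enumerate(hunk_lines)
               if line.startswith(("+", "-"))]
    keep = set()
    for i in changes:
        keep.update(range(max(0, i - n), min(len(hunk_lines), i + n + 1)))
    return [line for i, line in enumerate(hunk_lines) if i in keep]

hunk = [" a", " b", " c", "+added", " d", " e"]
print(reduce_context(hunk, 1))  # [' c', '+added', ' d']
print(reduce_context(hunk, 0))  # ['+added']
```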

### `files_excluded` in `DiffStats`

```python
@dataclass
class DiffStats:
total_files: int
total_lines: int
chunks_count: int
files_excluded: int = 0
```

When `exclude_patterns` is used with `load_diff`, the `files_excluded` count reports how many files were removed by the patterns. This count is included in the `load_diff` response.
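
The counting can be sketched with Python's `fnmatch`, assuming the case-insensitive matching described earlier; `count_excluded` is an illustrative stand-in, not diffchunk's actual function:

```python
from fnmatch import fnmatch

def count_excluded(paths, exclude_patterns):
    """Count paths removed by case-insensitive glob patterns."""
    return sum(
        1 for path in paths
        if any(fnmatch(path.lower(), pat.lower()) for pat in exclude_patterns)
    )

paths = ["src/main.py", "tests/test_main.py", "tests/Test_Utils.py"]
print(count_excluded(paths, ["*test*"]))  # 2
```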

### Feature Composition

`format` and `context_lines` compose correctly: `context_lines` reduces context at load time (stored in the session), then `format` transforms the already-reduced content at display time. Both can be used alongside `exclude_patterns`.

## Performance

- Target: <1 second for 100k+ line diffs
4 changes: 2 additions & 2 deletions scripts/pre-push
@@ -6,10 +6,10 @@ set -e
echo "Running pre-push checks..."

echo "Checking ruff lint..."
uv run ruff check src/
uv run ruff check .

echo "Checking code formatting..."
uv run ruff format --check src/
uv run ruff format --check .

echo "Running tests..."
uv run pytest tests/ -x -q
13 changes: 12 additions & 1 deletion src/chunker.py
@@ -1,7 +1,7 @@
"""Diff chunking functionality."""

import re
from typing import Dict, List, Tuple
from typing import Dict, List, Optional, Tuple
from .models import DiffChunk, DiffSession
from .parser import DiffParser

@@ -21,6 +21,7 @@ def chunk_diff(
include_patterns: List[str] | None = None,
exclude_patterns: List[str] | None = None,
max_chunk_lines: int | None = None,
context_lines: Optional[int] = None,
) -> None:
"""Chunk a diff file into the session."""
if max_chunk_lines is None:
@@ -38,9 +39,17 @@
except ValueError as e:
raise ValueError(f"Failed to parse diff: {e}")

if context_lines is not None:
file_changes = [
(files, self.parser.reduce_context(content, context_lines))
for files, content in file_changes
]

if not file_changes:
raise ValueError("Diff file parsed successfully but contains no changes")

files_excluded_count = 0

for files, content in file_changes:
# Apply filters
if skip_trivial and self.parser.is_trivial_change(content):
@@ -52,6 +61,7 @@ def chunk_diff(
if not self.parser.should_include_file(
files, include_patterns, exclude_patterns
):
files_excluded_count += 1
continue

content_lines = self.parser.count_lines(content)
@@ -140,6 +150,7 @@ def chunk_diff(

# Update session statistics
session.update_stats()
session.stats.files_excluded = files_excluded_count

if not session.chunks:
raise ValueError(