2 changes: 1 addition & 1 deletion .gitignore
@@ -1,6 +1,6 @@
# Cache
__pycache__
.mypy_cache
.uv_cache
.pytest_cache
.ruff_cache
.uv-cache
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -46,7 +46,7 @@ make format

### Type checking

Type checking is handled by [mypy](https://mypy.readthedocs.io/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.
Type checking is handled by [ty](https://docs.astral.sh/ty/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.

To run type checking:

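The fenced command block that follows this line in CONTRIBUTING.md is collapsed in the diff view. As a hedged sketch, based on the Makefile target updated in this same PR, running the type checker would look roughly like:

```bash
# Via the repository's Makefile target (which this PR points at ty):
make type-check

# Or directly through uv, mirroring the updated Makefile recipe:
uv run ty check
```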
4 changes: 2 additions & 2 deletions Makefile
@@ -5,7 +5,7 @@
E2E_TESTS_CONCURRENCY = 1

clean:
rm -rf .mypy_cache .pytest_cache .ruff_cache .uv-cache build dist htmlcov .coverage
rm -rf .uv_cache .pytest_cache .ruff_cache .uv-cache build dist htmlcov .coverage

install-sync:
uv sync --all-extras
@@ -27,7 +27,7 @@ lint:
uv run ruff check

type-check:
uv run mypy
uv run ty check

unit-tests:
uv run pytest \
3 changes: 1 addition & 2 deletions docs/deployment/code_examples/google/cloud_run_example.py
@@ -1,4 +1,3 @@
# mypy: disable-error-code="misc"
import json
import os

@@ -9,7 +8,7 @@
from crawlee.storage_clients import MemoryStorageClient


@get('/') # type: ignore[untyped-decorator]
@get('/')
async def main() -> str:
"""The crawler entry point that will be called when the HTTP endpoint is accessed."""
# highlight-start
3 changes: 1 addition & 2 deletions docs/deployment/code_examples/google/google_example.py
@@ -1,4 +1,3 @@
# mypy: disable-error-code="misc"
import asyncio
import json
from datetime import timedelta
@@ -48,7 +47,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
# highlight-end


@functions_framework.http # type: ignore[untyped-decorator]
@functions_framework.http
def crawlee_run(request: Request) -> Response:
# You can pass data to your crawler using `request`
function_id = request.headers['Function-Execution-Id']
@@ -9,7 +9,7 @@

async def main() -> None:
fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=['chromium']),
header_options=HeaderGeneratorOptions(browsers=['chrome']),
screen_options=ScreenOptions(min_width=400),
)

4 changes: 2 additions & 2 deletions docs/guides/code_examples/running_in_web_server/server.py
@@ -14,7 +14,7 @@
app = FastAPI(lifespan=lifespan, title='Crawler app')


@app.get('/', response_class=HTMLResponse) # type: ignore[untyped-decorator]
@app.get('/', response_class=HTMLResponse)
def index() -> str:
return """
<!DOCTYPE html>
@@ -32,7 +32,7 @@ def index() -> str:
"""


@app.get('/scrape') # type: ignore[untyped-decorator]
@app.get('/scrape')
async def scrape_url(request: Request, url: str | None = None) -> dict:
if not url:
return {'url': 'missing', 'scrape result': 'no results'}
57 changes: 12 additions & 45 deletions pyproject.toml
@@ -102,7 +102,6 @@ dev = [
"build<2.0.0", # For e2e tests.
"dycw-pytest-only<3.0.0",
"fakeredis[probabilistic,json,lua]<3.0.0",
"mypy~=1.19.0",
"pre-commit<5.0.0",
"proxy-py<3.0.0",
"pydoc-markdown<5.0.0",
@@ -113,6 +112,7 @@ dev = [
"pytest<9.0.0",
"ruff~=0.14.0",
"setuptools", # setuptools are used by pytest, but not explicitly required
"ty~=0.0.0",
"types-beautifulsoup4<5.0.0",
"types-cachetools<7.0.0",
"types-colorama<1.0.0",
@@ -230,57 +230,24 @@ filterwarnings = [
"ignore:websockets.server.WebSocketServerProtocol is deprecated:DeprecationWarning",
]

[tool.mypy]
python_version = "3.10"
plugins = ["pydantic.mypy"]
[tool.ty.environment]
python-version = "3.10"

[tool.ty.src]
include = ["src", "tests", "scripts", "docs", "website"]
exclude = [
"src/crawlee/project_template",
"docs/guides/code_examples/storage_clients/custom_storage_client_example.py",
]
files = ["src", "tests", "docs", "website"]
check_untyped_defs = true
disallow_incomplete_defs = true
disallow_untyped_calls = true
disallow_untyped_decorators = true
disallow_untyped_defs = true
no_implicit_optional = true
warn_redundant_casts = true
warn_return_any = true
warn_unreachable = true
warn_unused_ignores = true

[[tool.mypy.overrides]]
# Example codes are sometimes showing integration of crawlee with external tool, which is not dependency of crawlee.
module = [
"apify", # Example code shows integration of apify and crawlee.
"apify_fingerprint_datapoints", # Untyped and stubs not available
"camoufox", # Example code shows integration of camoufox and crawlee.
"fastapi", # Example code shows running in webserver.
"stagehand.*", # Example code shows integration of Stagehand and crawlee.
"starlette.*", # Example code shows running in webserver.
"flask", # Example code shows deploy on Google Cloud.
"functions_framework", # Example code shows deploy on Google Cloud.
"jaro", # Untyped and stubs not available
"litestar", # Example code shows deploy on Google Cloud Run.
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
"sklearn.linear_model", # Untyped and stubs not available
"cookiecutter.*", # Untyped and stubs not available
"inquirer.*", # Untyped and stubs not available
"warcio.*", # Example code shows WARC files creation.
"wrapt" # Untyped and stubs not available
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"running_in_web_server.*", # False positive when fastapi not available
[[tool.ty.overrides]]
include = [
"docs/**/*.py",
"website/**/*.py",
]
disable_error_code = ["misc"]

[tool.basedpyright]
pythonVersion = "3.10"
typeCheckingMode = "standard"
include = ["src", "tests", "docs", "website"]
[tool.ty.overrides.rules]
unresolved-import = "ignore"

[tool.coverage.report]
exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:", "assert_never()"]
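Because the removed mypy settings and the added ty settings are interleaved above, here is the new ty configuration reassembled from the added lines alone, purely as a readability aid (no changes beyond what the diff already shows):

```toml
[tool.ty.environment]
python-version = "3.10"

[tool.ty.src]
include = ["src", "tests", "scripts", "docs", "website"]
exclude = [
    "src/crawlee/project_template",
    "docs/guides/code_examples/storage_clients/custom_storage_client_example.py",
]

# Example code under docs/ and website/ integrates third-party tools that are
# not crawlee dependencies, so unresolved imports are not reported there
# (the same intent as the removed mypy overrides).
[[tool.ty.overrides]]
include = [
    "docs/**/*.py",
    "website/**/*.py",
]

[tool.ty.overrides.rules]
unresolved-import = "ignore"
```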
2 changes: 1 addition & 1 deletion src/crawlee/_browserforge_workaround.py
@@ -20,7 +20,7 @@ def patch_browserforge() -> None:
def DownloadIfNotExists(**flags: bool) -> None:
pass

download.DownloadIfNotExists = DownloadIfNotExists
download.DownloadIfNotExists = DownloadIfNotExists # ty: ignore[invalid-assignment]

import browserforge.bayesian_network

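Suppression comments across the PR switch from mypy's `# type: ignore[code]` to ty's rule-named `# ty: ignore[rule]` form, as on the monkey-patched assignment above. A minimal, self-contained sketch of the syntax (the code itself is hypothetical, not a crawlee API):

```python
from collections.abc import Callable

# The declared type accepts a single positional argument.
handler: Callable[[int], int] = lambda x: x + 1


def verbose_handler(x: int, debug: bool) -> int:
    """A replacement whose signature is intentionally incompatible."""
    return x + 1 if not debug else x


# If ty reports an invalid-assignment diagnostic here, it can be silenced for
# this one line with a rule-named suppression comment:
handler = verbose_handler  # ty: ignore[invalid-assignment]
```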
4 changes: 2 additions & 2 deletions src/crawlee/_request.py
@@ -93,7 +93,7 @@ def __setitem__(self, key: str, value: JsonSerializable) -> None:
def __delitem__(self, key: str) -> None:
del self.__pydantic_extra__[key]

def __iter__(self) -> Iterator[str]: # type: ignore[override]
def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
yield from self.__pydantic_extra__

def __len__(self) -> int:
@@ -195,7 +195,7 @@ class Request(BaseModel):
] = None
"""HTTP request payload."""

# Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
# Workaround for Pydantic and type checkers when using Annotated with default_factory
if TYPE_CHECKING:
headers: HttpHeaders = HttpHeaders()
"""HTTP request headers."""
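The `if TYPE_CHECKING` split above (repeated in `_types.py` and `_utils/system.py` below) is the PR's recurring workaround for how type checkers handle `Annotated` fields whose default comes from `default_factory`. A self-contained sketch of the pattern on a hypothetical model:

```python
from typing import TYPE_CHECKING, Annotated

from pydantic import BaseModel, Field


class Example(BaseModel):
    """Hypothetical model illustrating the workaround used in this PR."""

    # Type checkers only ever see a plain attribute with a simple default...
    if TYPE_CHECKING:
        tags: list[str] = []
    # ...while at runtime Pydantic sees the Annotated form with default_factory,
    # so each instance still gets its own fresh list.
    else:
        tags: Annotated[list[str], Field(default_factory=list)]


print(Example().tags)  # -> []
```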
27 changes: 13 additions & 14 deletions src/crawlee/_types.py
@@ -62,14 +62,14 @@ class HttpHeaders(RootModel, Mapping[str, str]):

model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

# Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
# Workaround for Pydantic and type checkers when using Annotated with default_factory
if TYPE_CHECKING:
root: dict[str, str] = {}
else:
root: Annotated[
dict[str, str],
PlainValidator(lambda value: _normalize_headers(value)),
Field(default_factory=dict),
Field(default_factory=lambda: dict[str, str]()),
]

def __getitem__(self, key: str) -> str:
@@ -91,7 +91,7 @@ def __ror__(self, other: HttpHeaders) -> HttpHeaders:
combined_headers = {**other, **self.root}
return HttpHeaders(combined_headers)

def __iter__(self) -> Iterator[str]: # type: ignore[override]
def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
yield from self.root

def __len__(self) -> int:
@@ -671,17 +671,16 @@ def create_modified_copy(
get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
) -> Self:
"""Create a modified copy of the crawling context with specified changes."""
original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
modified_fields = {
key: value
for key, value in {
'push_data': push_data,
'add_requests': add_requests,
'get_key_value_store': get_key_value_store,
}.items()
if value
}
return self.__class__(**{**original_fields, **modified_fields})
modifications = dict[str, Any]()

if push_data is not None:
modifications['push_data'] = push_data
if add_requests is not None:
modifications['add_requests'] = add_requests
if get_key_value_store is not None:
modifications['get_key_value_store'] = get_key_value_store

return dataclasses.replace(self, **modifications)


class GetDataKwargs(TypedDict):
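`create_modified_copy` now builds a plain dict of the overrides that were actually supplied and hands the copying itself to `dataclasses.replace`, instead of reconstructing every field by hand. A small standalone sketch with a hypothetical frozen dataclass:

```python
import dataclasses
from typing import Any


@dataclasses.dataclass(frozen=True)
class CrawlContext:
    """Hypothetical stand-in for the crawling context dataclass."""

    push_data: str
    add_requests: str
    get_key_value_store: str


original = CrawlContext('push-v1', 'add-v1', 'kvs-v1')

# Collect only the overrides that were actually provided (None means "keep"),
# mirroring the new create_modified_copy implementation above.
modifications = dict[str, Any]()
modifications['push_data'] = 'push-v2'

# dataclasses.replace copies every other field from the original instance.
updated = dataclasses.replace(original, **modifications)
print(updated)
# CrawlContext(push_data='push-v2', add_requests='add-v1', get_key_value_store='kvs-v1')
```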
2 changes: 1 addition & 1 deletion src/crawlee/_utils/context.py
@@ -44,4 +44,4 @@ async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:

return await method(self, *args, **kwargs)

return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value]
return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper # ty: ignore[invalid-return-type]
2 changes: 1 addition & 1 deletion src/crawlee/_utils/file.py
@@ -170,7 +170,7 @@ async def export_csv_to_stream(
if 'lineterminator' not in kwargs:
kwargs['lineterminator'] = '\n'

writer = csv.writer(dst, **kwargs) # type: ignore[arg-type]
writer = csv.writer(dst, **kwargs)
write_header = True

# Iterate over the dataset and write to CSV.
2 changes: 1 addition & 1 deletion src/crawlee/_utils/globs.py
@@ -36,7 +36,7 @@ def _translate(
if not seps:
seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep

escaped_seps = ''.join(map(re.escape, seps))
escaped_seps = ''.join(map(re.escape, seps)) # ty: ignore[invalid-argument-type]
any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
not_sep = f'[^{escaped_seps}]'
if include_hidden:
4 changes: 2 additions & 2 deletions src/crawlee/_utils/recurring_task.py
@@ -25,7 +25,7 @@ class RecurringTask:
"""

def __init__(self, func: Callable, delay: timedelta) -> None:
logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')
logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...') # ty: ignore[unresolved-attribute]
self.func = func
self.delay = delay
self.task: asyncio.Task | None = None
@@ -55,7 +55,7 @@ async def _wrapper(self) -> None:

def start(self) -> None:
"""Start the recurring task execution."""
self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')
self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}') # ty: ignore[possibly-missing-attribute]

async def stop(self) -> None:
"""Stop the recurring task execution."""
10 changes: 5 additions & 5 deletions src/crawlee/_utils/sitemap.py
@@ -430,10 +430,10 @@ async def parse_sitemap(
up to the specified maximum depth.
"""
# Set default options
options = options or {}
emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
max_depth = options.get('max_depth', float('inf'))
sitemap_retries = options.get('sitemap_retries', 3)
options = options or {} # ty: ignore[invalid-assignment]
emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) # ty: ignore[possibly-missing-attribute]
max_depth = options.get('max_depth', float('inf')) # ty: ignore[possibly-missing-attribute]
sitemap_retries = options.get('sitemap_retries', 3) # ty: ignore[possibly-missing-attribute]

# Setup working state
sources = list(initial_sources)
@@ -472,7 +472,7 @@ async def parse_sitemap(
sitemap_retries,
emit_nested_sitemaps=emit_nested_sitemaps,
proxy_info=proxy_info,
timeout=options.get('timeout', timedelta(seconds=30)),
timeout=options.get('timeout', timedelta(seconds=30)), # ty: ignore[possibly-missing-attribute]
):
yield result
else:
38 changes: 27 additions & 11 deletions src/crawlee/_utils/system.py
@@ -5,7 +5,7 @@
from contextlib import suppress
from datetime import datetime, timezone
from logging import getLogger
from typing import Annotated
from typing import TYPE_CHECKING, Annotated

import psutil
from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
@@ -41,11 +41,19 @@ class CpuInfo(BaseModel):
used_ratio: Annotated[float, Field(alias='usedRatio')]
"""The ratio of CPU currently in use, represented as a float between 0 and 1."""

created_at: datetime = Field(
alias='createdAt',
default_factory=lambda: datetime.now(timezone.utc),
)
"""The time at which the measurement was taken."""
# Workaround for Pydantic and type checkers when using Annotated with default_factory
if TYPE_CHECKING:
created_at: datetime = datetime.now(timezone.utc)
"""The time at which the measurement was taken."""
else:
created_at: Annotated[
datetime,
Field(
alias='createdAt',
default_factory=lambda: datetime.now(timezone.utc),
),
]
"""The time at which the measurement was taken."""


class MemoryUsageInfo(BaseModel):
@@ -61,11 +69,19 @@ class MemoryUsageInfo(BaseModel):
]
"""Memory usage of the current Python process and its children."""

created_at: datetime = Field(
alias='createdAt',
default_factory=lambda: datetime.now(timezone.utc),
)
"""The time at which the measurement was taken."""
# Workaround for Pydantic and type checkers when using Annotated with default_factory
if TYPE_CHECKING:
created_at: datetime = datetime.now(timezone.utc)
"""The time at which the measurement was taken."""
else:
created_at: Annotated[
datetime,
Field(
alias='createdAt',
default_factory=lambda: datetime.now(timezone.utc),
),
]
"""The time at which the measurement was taken."""


class MemoryInfo(MemoryUsageInfo):
2 changes: 1 addition & 1 deletion src/crawlee/browsers/_browser_pool.py
@@ -142,7 +142,7 @@ def with_default_plugin(
plugin_options['browser_new_context_options'] = browser_new_context_options or {}

if headless is not None:
plugin_options['browser_launch_options']['headless'] = headless
plugin_options['browser_launch_options']['headless'] = headless # ty: ignore[invalid-assignment]

if use_incognito_pages is not None:
plugin_options['use_incognito_pages'] = use_incognito_pages
2 changes: 1 addition & 1 deletion src/crawlee/browsers/_playwright_browser.py
@@ -78,7 +78,7 @@ async def new_context(self, **context_options: Any) -> BrowserContext:

async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
if self._temp_dir and self._temp_dir.exists():
await asyncio.to_thread(shutil.rmtree, self._temp_dir, ignore_errors=True)
await asyncio.to_thread(lambda: shutil.rmtree(self._temp_dir, ignore_errors=True)) # ty: ignore[invalid-argument-type]

@override
async def close(self, **kwargs: Any) -> None: