diff --git a/app/db/queries.py b/app/db/queries.py index dfd9f08..0651945 100644 --- a/app/db/queries.py +++ b/app/db/queries.py @@ -78,6 +78,16 @@ async def build_conditions( conditions.append(f"amino_acids >= ${param_idx}") query_params[param_name] = params.sequence_length + if params.curation_status is not None: + column_conditions = [] + for value in params.curation_status: + param_idx += 1 + param_name = f"param_{param_idx}" + column_conditions.append(f"LOWER(curation_status) = LOWER(${param_idx})") + query_params[param_name] = value + if column_conditions: + conditions.append(f"({' OR '.join(column_conditions)})") + # Combine all conditions with AND logic where_clause = " AND ".join(conditions) if conditions else "TRUE" @@ -151,41 +161,239 @@ async def get_total_count(db: Database, params: CLEANSearchQueryParams) -> int: result = await db.fetchval(query, *query_args) return result +def _has_search_context(params: CLEANTypeaheadQueryParams) -> bool: + """Check if any search context filters are provided.""" + return any([ + params.accession, + params.organism, + params.protein_name, + params.gene_name, + params.uniprot_id, + params.clean_ec_number, + params.curation_status, + params.clean_ec_confidence_min, + params.clean_ec_confidence_max, + params.sequence_length, + ]) + + +def _build_typeahead_context_conditions( + params: CLEANTypeaheadQueryParams, + start_param_idx: int = 1, +) -> Tuple[str, Dict[str, Any], int]: + """Build SQL conditions from typeahead context parameters.""" + conditions = [] + query_params = {} + param_idx = start_param_idx + + # Process string filters (case-insensitive exact matches with OR logic within columns) + string_columns = { + "accession": params.accession, + "protein_name": params.protein_name, + "organism": params.organism, + "gene_name": params.gene_name, + "uniprot_id": params.uniprot_id, + } + + for column, values in string_columns.items(): + if values: + column_conditions = [] + for value in values: + param_idx += 1 + 
param_name = f"param_{param_idx}" + if column == "accession": + column_conditions.append(f"pua.{column} = UPPER(${param_idx})") + else: + column_conditions.append(f"LOWER(pua.{column}) = LOWER(${param_idx})") + query_params[param_name] = value + if column_conditions: + conditions.append(f"({' OR '.join(column_conditions)})") + + if params.clean_ec_number is not None: + column_conditions = [] + for value in params.clean_ec_number: + param_idx += 1 + param_name = f"param_{param_idx}" + if value.endswith("-"): + column_conditions.append(f"puace.clean_ec_number LIKE ${param_idx}") + query_params[param_name] = re.sub(r'-.*$', '%', value) + else: + column_conditions.append(f"puace.clean_ec_number = ${param_idx}") + query_params[param_name] = value + if column_conditions: + conditions.append(f"({' OR '.join(column_conditions)})") + + if params.clean_ec_confidence_min is not None: + param_idx += 1 + param_name = f"param_{param_idx}" + conditions.append(f"puace.clean_ec_confidence > ${param_idx}") + query_params[param_name] = params.clean_ec_confidence_min + + if params.clean_ec_confidence_max is not None: + param_idx += 1 + param_name = f"param_{param_idx}" + conditions.append(f"puace.clean_ec_confidence < ${param_idx}") + query_params[param_name] = params.clean_ec_confidence_max + + if params.sequence_length is not None: + param_idx += 1 + param_name = f"param_{param_idx}" + conditions.append(f"pua.amino_acids >= ${param_idx}") + query_params[param_name] = params.sequence_length + + if params.curation_status is not None: + column_conditions = [] + for value in params.curation_status: + param_idx += 1 + param_name = f"param_{param_idx}" + column_conditions.append(f"LOWER(pua.curation_status) = LOWER(${param_idx})") + query_params[param_name] = value + if column_conditions: + conditions.append(f"({' OR '.join(column_conditions)})") + + where_clause = " AND ".join(conditions) if conditions else "TRUE" + return where_clause, query_params, param_idx + + async def 
get_typeahead_suggestions(db: Database, params: CLEANTypeaheadQueryParams -) -> List[str]: - """Get typeahead suggestions based on the query parameters.""" +) -> Tuple[List[str], int]: + """Get typeahead suggestions based on the query parameters. + + Returns a tuple of (matches, total_count). + """ search = params.search.strip() if len(search) < 3: raise ValueError("Search term must be at least 3 characters long.") - if params.field_name == 'accession': - # match the beginning of the string - # accessions are stored and indexed in uppercase - search += '%' - query = f"""SELECT DISTINCT accession FROM cleandb.predictions_uniprot_annot WHERE accession LIKE UPPER($1) ORDER BY 1 ASC""" - elif params.field_name == 'organism': - search = '%' + search + '%' - # match any part of the string - query = f"""SELECT DISTINCT organism FROM cleandb.predictions_uniprot_annot_mv01 WHERE organism_lower LIKE LOWER($1) ORDER BY 1 ASC""" - elif params.field_name == 'protein_name': - # match any part of the string - search = '%' + search + '%' - query = f"""SELECT DISTINCT protein_name FROM cleandb.predictions_uniprot_annot_mv02 WHERE protein_name_lower LIKE LOWER($1) ORDER BY 1 ASC""" - elif params.field_name == 'gene_name': - # match any part of the string (note we have gene names that start with an apostrophe, for example, which the user might not expect) - search = '%' + search + '%' - query = f"""SELECT DISTINCT gene_name FROM cleandb.predictions_uniprot_annot_mv03 WHERE gene_name_lower LIKE LOWER($1) ORDER BY 1 ASC""" - elif params.field_name == 'uniprot_id': - search = '%' + search + '%' - query = f"""SELECT DISTINCT uniprot_id FROM cleandb.predictions_uniprot_annot WHERE LOWER(uniprot_id) LIKE LOWER($1) ORDER BY 1 ASC""" - else: + limit = params.limit or 20 + offset = params.offset or 0 + has_context = _has_search_context(params) + + # Field-specific configuration + field_config = { + 'accession': { + 'search_pattern': lambda s: s + '%', # match beginning + 'search_condition': 
'pua.accession LIKE UPPER($1)', + 'mv_table': None, + 'mv_search_condition': None, + 'column': 'accession', + 'result_column': 'accession', + }, + 'organism': { + 'search_pattern': lambda s: '%' + s + '%', # match anywhere + 'search_condition': 'LOWER(pua.organism) LIKE LOWER($1)', + 'mv_table': 'cleandb.predictions_uniprot_annot_mv01', + 'mv_search_condition': 'organism_lower LIKE LOWER($1)', + 'column': 'organism', + 'result_column': 'organism', + }, + 'protein_name': { + 'search_pattern': lambda s: '%' + s + '%', + 'search_condition': 'LOWER(pua.protein_name) LIKE LOWER($1)', + 'mv_table': 'cleandb.predictions_uniprot_annot_mv02', + 'mv_search_condition': 'protein_name_lower LIKE LOWER($1)', + 'column': 'protein_name', + 'result_column': 'protein_name', + }, + 'gene_name': { + 'search_pattern': lambda s: '%' + s + '%', + 'search_condition': 'LOWER(pua.gene_name) LIKE LOWER($1)', + 'mv_table': 'cleandb.predictions_uniprot_annot_mv03', + 'mv_search_condition': 'gene_name_lower LIKE LOWER($1)', + 'column': 'gene_name', + 'result_column': 'gene_name', + }, + 'uniprot_id': { + 'search_pattern': lambda s: '%' + s + '%', + 'search_condition': 'LOWER(pua.uniprot_id) LIKE LOWER($1)', + 'mv_table': None, + 'mv_search_condition': None, + 'column': 'uniprot_id', + 'result_column': 'uniprot_id', + }, + 'predicted_ec': { + 'search_pattern': lambda s: s + '%', # match beginning of EC number + 'search_condition': 'puace.clean_ec_number LIKE $1', + 'mv_table': None, + 'mv_search_condition': None, + 'column': 'clean_ec_number', + 'result_column': 'clean_ec_number', + }, + } + + if params.field_name not in field_config: raise ValueError(f"Invalid field name: {params.field_name}") - query += f" LIMIT {params.limit or 10}" + config = field_config[params.field_name] + search_term = config['search_pattern'](search) + + if not has_context: + # No search context - use materialized views for better performance when available + if params.field_name == 'predicted_ec': + # Query the EC 
table directly + count_query = f"""SELECT COUNT(DISTINCT clean_ec_number) FROM cleandb.predictions_uniprot_annot_clean_ec WHERE clean_ec_number LIKE $1""" + data_query = f"""SELECT DISTINCT clean_ec_number FROM cleandb.predictions_uniprot_annot_clean_ec WHERE clean_ec_number LIKE $1 ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}""" + elif config['mv_table']: + # Use materialized view + count_query = f"""SELECT COUNT(DISTINCT {config['column']}) FROM {config['mv_table']} WHERE {config['mv_search_condition']}""" + data_query = f"""SELECT DISTINCT {config['column']} FROM {config['mv_table']} WHERE {config['mv_search_condition']} ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}""" + else: + # Query main table directly + count_query = f"""SELECT COUNT(DISTINCT {config['column']}) FROM cleandb.predictions_uniprot_annot pua WHERE {config['search_condition']}""" + data_query = f"""SELECT DISTINCT {config['column']} FROM cleandb.predictions_uniprot_annot pua WHERE {config['search_condition']} ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}""" + + total = await db.fetchval(count_query, search_term) + records = await db.fetch(data_query, search_term) + return [record[config['result_column']] for record in records], total - # Execute the query - records = await db.fetch(query, search) - return [record[params.field_name] for record in records] + else: + # Has search context - need to join with main table and apply filters + context_where, context_params, param_idx = _build_typeahead_context_conditions(params, start_param_idx=1) + + # The search term will be $1, context params start from $2 + # Rebuild context conditions with offset + context_where, context_params, _ = _build_typeahead_context_conditions(params, start_param_idx=1) + + # Build the query based on field type + if params.field_name == 'predicted_ec': + # Need to join with EC table + base_query = f""" + FROM cleandb.predictions_uniprot_annot pua + INNER JOIN cleandb.predictions_uniprot_annot_clean_ec puace + ON 
puace.predictions_uniprot_annot_id = pua.predictions_uniprot_annot_id + WHERE puace.clean_ec_number LIKE $1 + AND {context_where} + """ + select_column = "puace.clean_ec_number" + else: + # Check if we need to join with EC table for context filtering + needs_ec_join = params.clean_ec_number is not None or params.clean_ec_confidence_min is not None or params.clean_ec_confidence_max is not None + + if needs_ec_join: + base_query = f""" + FROM cleandb.predictions_uniprot_annot pua + INNER JOIN cleandb.predictions_uniprot_annot_clean_ec puace + ON puace.predictions_uniprot_annot_id = pua.predictions_uniprot_annot_id + WHERE {config['search_condition']} + AND {context_where} + """ + else: + base_query = f""" + FROM cleandb.predictions_uniprot_annot pua + WHERE {config['search_condition']} + AND {context_where} + """ + select_column = f"pua.{config['column']}" + + count_query = f"SELECT COUNT(*) FROM (SELECT DISTINCT {select_column} {base_query}) sub" + data_query = f"SELECT DISTINCT {select_column} {base_query} ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}" + + # Build query args: search_term first, then context params + query_args = [search_term] + list(context_params.values()) + + total = await db.fetchval(count_query, *query_args) + records = await db.fetch(data_query, *query_args) + return [record[config['result_column']] for record in records], total async def get_ec_suggestions(db: Database, params: CLEANECLookupQueryParams ) -> List[Dict[str, str]]: diff --git a/app/models/clean_data.py b/app/models/clean_data.py index 861b37d..c96ffa7 100644 --- a/app/models/clean_data.py +++ b/app/models/clean_data.py @@ -112,7 +112,7 @@ class CLEANSearchResponse(BaseModel): class CLEANTypeaheadResponse(BaseModel): """Model for the response of a CLEAN typeahead query.""" - field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id'] = Field( + field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id', 'predicted_ec'] = 
Field( 'organism', description="Which field to search in", ), @@ -125,6 +125,30 @@ class CLEANTypeaheadResponse(BaseModel): [], description="List of results matching the search term." ) + search_context: Optional[dict] = Field( + None, + description="The search context filters that were applied to the typeahead query." + ) + total: int = Field( + 0, + description="Total number of matching results (before pagination)." + ) + limit: int = Field( + 20, + description="Maximum number of results returned." + ) + offset: int = Field( + 0, + description="Number of results skipped." + ) + next: Optional[str] = Field( + None, + description="Link to the next page of results." + ) + previous: Optional[str] = Field( + None, + description="Link to the previous page of results." + ) class CLEANECLookupMatch(BaseModel): """Model for a single match in the CLEAN EC lookup response.""" @@ -145,4 +169,24 @@ class CLEANECLookupResponse(BaseModel): matches: List[CLEANECLookupMatch] = Field( [], description="List of matches for the EC lookup." + ) + + +class CurationStatusOption(BaseModel): + """Model for a curation status option.""" + value: str = Field( + ..., + description="The value of the curation status (e.g., 'reviewed', 'unreviewed')." + ) + label: str = Field( + ..., + description="The human-readable label for the curation status." + ) + + +class CLEANCurationStatusResponse(BaseModel): + """Model for the response of the curation statuses endpoint.""" + statuses: List[CurationStatusOption] = Field( + [], + description="List of available curation status options." 
) \ No newline at end of file diff --git a/app/models/query_params.py b/app/models/query_params.py index f9b33ea..ab8c950 100644 --- a/app/models/query_params.py +++ b/app/models/query_params.py @@ -51,6 +51,9 @@ class CLEANSearchQueryParams(BaseModel): sequence_length: Optional[str] = Field( None, description="Minimum sequence length" ) + curation_status: Optional[List[str]] = Field( + None, description="Curation status filter (reviewed/unreviewed), case-insensitive exact match (multiple values allowed, OR logic)" + ) # Numeric range filters (removed as requested) @@ -68,7 +71,7 @@ class CLEANSearchQueryParams(BaseModel): class CLEANTypeaheadQueryParams(BaseModel): """Query parameters for CLEAN typeahead suggestions.""" - field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id'] = Field( + field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id', 'predicted_ec'] = Field( 'organism', description="Which field to search in", ) @@ -80,6 +83,41 @@ class CLEANTypeaheadQueryParams(BaseModel): limit: Optional[int] = Field( None, description="Maximum number of records to return" ) + offset: Optional[int] = Field( + 0, description="Number of records to skip for pagination" + ) + + # Search context fields - when provided, typeahead results are filtered to match the current search context + accession: Optional[List[str]] = Field( + None, description="Filter typeahead results by accession" + ) + organism: Optional[List[str]] = Field( + None, description="Filter typeahead results by organism" + ) + protein_name: Optional[List[str]] = Field( + None, description="Filter typeahead results by protein name" + ) + gene_name: Optional[List[str]] = Field( + None, description="Filter typeahead results by gene name" + ) + uniprot_id: Optional[List[str]] = Field( + None, description="Filter typeahead results by uniprot ID" + ) + clean_ec_number: Optional[List[str]] = Field( + None, description="Filter typeahead results by CLEAN 
EC number" + ) + curation_status: Optional[List[str]] = Field( + None, description="Filter typeahead results by curation status" + ) + clean_ec_confidence_min: Optional[float] = Field( + None, description="Filter typeahead results by minimum CLEAN EC confidence" + ) + clean_ec_confidence_max: Optional[float] = Field( + None, description="Filter typeahead results by maximum CLEAN EC confidence" + ) + sequence_length: Optional[str] = Field( + None, description="Filter typeahead results by minimum sequence length" + ) class CLEANECLookupQueryParams(BaseModel): """Query parameters for CLEAN EC lookup.""" diff --git a/app/routers/search.py b/app/routers/search.py index 9e0089b..66cdac4 100644 --- a/app/routers/search.py +++ b/app/routers/search.py @@ -11,7 +11,7 @@ from app.db.database import Database, get_db from app.db.queries import get_ec_suggestions, get_filtered_data, get_total_count, get_typeahead_suggestions from app.models.query_params import CLEANECLookupQueryParams, CLEANSearchQueryParams, CLEANTypeaheadQueryParams, ResponseFormat -from app.models.clean_data import CLEANDataBase, CLEANECLookupResponse, CLEANECLookupMatch, CLEANSearchResponse, CLEANTypeaheadResponse +from app.models.clean_data import CLEANDataBase, CLEANECLookupResponse, CLEANECLookupMatch, CLEANSearchResponse, CLEANTypeaheadResponse, CurationStatusOption, CLEANCurationStatusResponse router = APIRouter(tags=["Search"]) @@ -42,6 +42,10 @@ def parse_query_params( None, description="Uniprot ID" ), + curation_status: Optional[List[str]] = Query( + None, + description="Curation status (reviewed/unreviewed)" + ), # Additional filters clean_ec_confidence_min: Optional[float] = Query( None, description="Minimum confidence for CLEAN predicted EC number" @@ -77,6 +81,7 @@ def parse_query_params( clean_ec_confidence_max = clean_ec_confidence_max, sequence_length = sequence_length, uniprot_id = uniprot, + curation_status=curation_status, format=format, limit=limit, offset=offset, @@ -228,7 +233,7 @@ async 
def get_data( raise HTTPException(status_code=500, detail=f"Error retrieving data: {str(e)}") def parse_typeahead_params( - field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id'] = Query( + field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id', 'predicted_ec'] = Query( 'organism', description="Which field to search in", ), @@ -236,13 +241,62 @@ def parse_typeahead_params( None, min_length=3, description="Search term for typeahead suggestions (minimum 3 characters)" - ) + ), + limit: Optional[int] = Query( + 20, description="Maximum number of records to return" + ), + offset: Optional[int] = Query( + 0, description="Number of records to skip" + ), + # Search context filters + accession: Optional[List[str]] = Query( + None, description="Filter typeahead results by accession" + ), + organism: Optional[List[str]] = Query( + None, description="Filter typeahead results by organism" + ), + protein: Optional[List[str]] = Query( + None, description="Filter typeahead results by protein name" + ), + gene_name: Optional[List[str]] = Query( + None, description="Filter typeahead results by gene name" + ), + ec_number: Optional[List[str]] = Query( + None, description="Filter typeahead results by CLEAN EC number" + ), + uniprot: Optional[List[str]] = Query( + None, description="Filter typeahead results by uniprot ID" + ), + curation_status: Optional[List[str]] = Query( + None, description="Filter typeahead results by curation status" + ), + clean_ec_confidence_min: Optional[float] = Query( + None, description="Filter typeahead results by minimum CLEAN EC confidence" + ), + clean_ec_confidence_max: Optional[float] = Query( + None, description="Filter typeahead results by maximum CLEAN EC confidence" + ), + sequence_length: Optional[str] = Query( + None, description="Filter typeahead results by minimum sequence length" + ), ) -> CLEANTypeaheadQueryParams: """Parse and validate query parameters.""" try: return 
CLEANTypeaheadQueryParams( field_name=field_name, search=search, + limit=limit, + offset=offset, + accession=accession, + organism=organism, + protein_name=protein, + gene_name=gene_name, + clean_ec_number=ec_number, + uniprot_id=uniprot, + curation_status=curation_status, + clean_ec_confidence_min=clean_ec_confidence_min, + clean_ec_confidence_max=clean_ec_confidence_max, + sequence_length=sequence_length, ) except Exception as e: logger.error(f"Error parsing query parameters: {e}") @@ -250,6 +304,33 @@ def parse_typeahead_params( status_code=400, detail=f"Invalid query parameters: {str(e)}" ) + +def _build_search_context(params: CLEANTypeaheadQueryParams) -> Optional[dict]: + """Build the search context dict from non-None context params.""" + context = {} + if params.accession: + context['accession'] = params.accession + if params.organism: + context['organism'] = params.organism + if params.protein_name: + context['protein_name'] = params.protein_name + if params.gene_name: + context['gene_name'] = params.gene_name + if params.clean_ec_number: + context['ec_number'] = params.clean_ec_number + if params.uniprot_id: + context['uniprot'] = params.uniprot_id + if params.curation_status: + context['curation_status'] = params.curation_status + if params.clean_ec_confidence_min is not None: + context['clean_ec_confidence_min'] = params.clean_ec_confidence_min + if params.clean_ec_confidence_max is not None: + context['clean_ec_confidence_max'] = params.clean_ec_confidence_max + if params.sequence_length: + context['sequence_length'] = params.sequence_length + return context if context else None + + @router.get("/typeahead", summary="Get typeahead suggestions for searching the database of predicted EC numbers.") async def get_typeahead( params: CLEANTypeaheadQueryParams = Depends(parse_typeahead_params), @@ -261,20 +342,92 @@ async def get_typeahead( """ try: - params.limit = 20 + limit = params.limit or 20 + offset = params.offset or 0 + # Get data from database - data 
= await get_typeahead_suggestions(db, params) + matches, total = await get_typeahead_suggestions(db, params) + + # Build search context + search_context = _build_search_context(params) + + # Build pagination URLs + next_url = None + previous_url = None + + if request: + base_url = str(request.url).split("?")[0] + + # Build base query params (excluding pagination) + base_params = { + "field_name": params.field_name, + "search": params.search, + "limit": limit, + } + + # Add context params if present + if params.accession: + base_params["accession"] = params.accession + if params.organism: + base_params["organism"] = params.organism + if params.protein_name: + base_params["protein"] = params.protein_name + if params.gene_name: + base_params["gene_name"] = params.gene_name + if params.clean_ec_number: + base_params["ec_number"] = params.clean_ec_number + if params.uniprot_id: + base_params["uniprot"] = params.uniprot_id + if params.curation_status: + base_params["curation_status"] = params.curation_status + if params.clean_ec_confidence_min is not None: + base_params["clean_ec_confidence_min"] = params.clean_ec_confidence_min + if params.clean_ec_confidence_max is not None: + base_params["clean_ec_confidence_max"] = params.clean_ec_confidence_max + if params.sequence_length: + base_params["sequence_length"] = params.sequence_length + + # Next page + if offset + limit < total: + next_params = {**base_params, "offset": offset + limit} + next_url = f"{base_url}?{urlencode(next_params, doseq=True)}" + + # Previous page + if offset > 0: + prev_offset = max(0, offset - limit) + prev_params = {**base_params, "offset": prev_offset} + previous_url = f"{base_url}?{urlencode(prev_params, doseq=True)}" return CLEANTypeaheadResponse( field_name=params.field_name, search=params.search, - matches=data + matches=matches, + search_context=search_context, + total=total, + limit=limit, + offset=offset, + next=next_url, + previous=previous_url, ) except Exception as e: 
logger.error(f"Error getting data: {e}") raise HTTPException(status_code=500, detail=f"Error retrieving data: {str(e)}") + +@router.get("/curation-statuses", summary="Get available curation status options") +async def get_curation_statuses() -> CLEANCurationStatusResponse: + """ + Get the list of available curation status options for filtering. + """ + return CLEANCurationStatusResponse( + statuses=[ + CurationStatusOption(value="reviewed", label="Reviewed (Swiss-Prot)"), + CurationStatusOption(value="unreviewed", label="Unreviewed (TrEMBL)"), + ] + ) + + def parse_ec_lookup_params( search: str = Query( None, diff --git a/docs/filter-requirements.md b/docs/filter-requirements.md new file mode 100644 index 0000000..3988ff3 --- /dev/null +++ b/docs/filter-requirements.md @@ -0,0 +1,414 @@ +# Filter Feature - Backend Requirements + +## Overview + +Backend API enhancements to support the filter feature on the database-search page. This includes one new endpoint (`GET /api/v1/curation-statuses`) and enhancements to two existing endpoints (`GET /api/v1/typeahead` and `GET /api/v1/search`). + +--- + +## Summary of Changes + +| Change | Type | Description | +|--------|------|-------------| +| `GET /api/v1/curation-statuses` | New endpoint | Returns available curation status options | +| `GET /api/v1/typeahead` | Enhancement | Add search context params, pagination, and `predicted_ec` field | +| `curation_status` param on `/api/v1/search` | Enhancement | Adds curation status filtering to existing search | + +--- + +## Enhancement: Typeahead Endpoint + +### `GET /api/v1/typeahead` + +Enhance the existing typeahead endpoint to support: +1. **Search context parameters** — Constrain results to current search/filter context +2. **Pagination** — Handle high-cardinality fields +3. **New field: `predicted_ec`** — Support EC number typeahead + +### Updated Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `field_name` | string | No | Field to search. 
**Updated enum:** `accession`, `organism`, `protein_name`, `gene_name`, `uniprot_id`, `predicted_ec` (new). Default: `organism` | +| `search` | string | Yes | Search term (minimum 3 characters) | +| `limit` | integer | No | Maximum results per page. Default: `20` | +| `offset` | integer | No | Number of results to skip. Default: `0` | +| **Search context params** | | | | +| `accession` | array[string] | No | Filter by UniProt Accession | +| `organism` | array[string] | No | Filter by Organism | +| `protein` | array[string] | No | Filter by Protein Name | +| `gene_name` | array[string] | No | Filter by Gene Name | +| `uniprot` | array[string] | No | Filter by UniProt ID | +| `ec_number` | array[string] | No | Filter by predicted EC number | +| `curation_status` | array[string] | No | Filter by curation status | +| `clean_ec_confidence_min` | number | No | Minimum confidence score | +| `clean_ec_confidence_max` | number | No | Maximum confidence score | +| `sequence_length` | string | No | Minimum sequence length | + +### Example Request + +Typeahead for protein names within Homo sapiens records that have reviewed curation status: + +``` +GET /api/v1/typeahead?field_name=protein_name&search=Free&organism=Homo+sapiens&curation_status=reviewed&limit=20&offset=0 +``` + +### Example Response + +```json +{ + "field_name": "protein_name", + "search": "Free", + "search_context": { + "organism": ["Homo sapiens"], + "curation_status": ["reviewed"] + }, + "total": 85, + "limit": 20, + "offset": 0, + "matches": [ + "Free fatty acid receptor 1", + "Free fatty acid receptor 2", + "Free fatty acid receptor 3", + "Free fatty acid receptor 4" + ], + "next": "/api/v1/typeahead?field_name=protein_name&search=Free&organism=Homo+sapiens&curation_status=reviewed&limit=20&offset=20", + "previous": null +} +``` + +### Updated Response Schema + +```yaml +CLEANTypeaheadResponse: + type: object + properties: + field_name: + type: string + enum: + - accession + - organism + - protein_name + - 
gene_name + - uniprot_id + - predicted_ec + description: The field that was searched + search: + type: string + minLength: 3 + description: The search term used + search_context: + type: object + additionalProperties: true + description: Echo of search context parameters used to constrain results + total: + type: integer + description: Total number of matches across all pages + limit: + type: integer + description: Maximum results per page + default: 20 + offset: + type: integer + description: Number of results skipped + default: 0 + matches: + type: array + items: + type: string + description: List of matching values for this page + next: + anyOf: + - type: string + - type: 'null' + description: URL for next page of results, or null if no more pages + previous: + anyOf: + - type: string + - type: 'null' + description: URL for previous page of results, or null if on first page + required: + - field_name + - search + - total + - limit + - offset + - matches +``` + +### Implementation Notes + +**Search Context Filtering:** +- When search context params are provided, the typeahead query should only return values that exist within records matching those constraints +- Example: `?field_name=protein_name&search=Free&organism=Homo+sapiens` returns only protein names containing "Free" that appear in Homo sapiens records + +**Predicted EC Field:** +- The `predicted_ec` field searches within the `predicted_ec[].ec_number` array +- Returns distinct EC numbers matching the search term + +**Pagination:** +- Results should be sorted alphabetically for consistency +- `total` reflects the full count of matches, not just the current page +- `next` and `previous` URLs should include all original query parameters + +**Backward Compatibility:** +- If no search context params or pagination params are provided, behavior matches existing endpoint +- New fields (`search_context`, `total`, `next`, `previous`) are additive + +--- + +## New Endpoint: Curation Statuses + +### `GET 
/api/v1/curation-statuses` + +Returns the list of available curation status values for filtering. + +### Request + +No parameters required. + +### Response + +**Status:** `200 OK` + +**Content-Type:** `application/json` + +```json +{ + "statuses": [ + { + "value": "reviewed", + "label": "Reviewed (Swiss-Prot)" + }, + { + "value": "unreviewed", + "label": "Unreviewed (TrEMBL)" + } + ] +} +``` + +### Response Schema + +```yaml +CLEANCurationStatusResponse: + type: object + properties: + statuses: + type: array + items: + $ref: '#/components/schemas/CurationStatusOption' + description: List of available curation status options + +CurationStatusOption: + type: object + properties: + value: + type: string + description: Value to use in API filter requests + label: + type: string + description: Human-readable display label + required: + - value + - label +``` + +### Notes + +- This endpoint can return hardcoded values or query distinct values from the database +- Values should match exactly what is stored in the `curation_status` field +- Labels are for display purposes in the UI + +--- + +## Enhancement: Search Endpoint + +### `GET /api/v1/search` + +Add `curation_status` as a new filter parameter. + +### New Parameter + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `curation_status` | array[string] | No | Filter by curation status. Accepts multiple values (OR logic). 
| + +### Valid Values + +- `reviewed` — Reviewed (Swiss-Prot) +- `unreviewed` — Unreviewed (TrEMBL) + +### Example Requests + +Single value: +``` +GET /api/v1/search?protein=Free+fatty+acid+receptor+2&curation_status=reviewed +``` + +Multiple values (OR logic): +``` +GET /api/v1/search?protein=Free+fatty+acid+receptor+2&curation_status=reviewed&curation_status=unreviewed +``` + +### Filter Logic + +- Multiple `curation_status` values use OR logic +- `curation_status` combined with other filters uses AND logic + +**Example:** +``` +GET /api/v1/search?organism=Homo+sapiens&curation_status=reviewed&clean_ec_confidence_min=0.8 +``` + +Translates to: +```sql +WHERE organism = 'Homo sapiens' + AND curation_status = 'reviewed' + AND clean_ec_confidence >= 0.8 +``` + +### Updated OpenAPI Spec Addition + +```yaml +parameters: + - name: curation_status + in: query + required: false + schema: + anyOf: + - type: array + items: + type: string + enum: + - reviewed + - unreviewed + - type: 'null' + description: Curation status filter + title: Curation Status + description: Filter by curation status (reviewed or unreviewed) +``` + +--- + +## Data Model Reference + +For context, relevant fields from the existing `CLEANDataBase` schema: + +```yaml +CLEANDataBase: + properties: + curation_status: + anyOf: + - type: string + - type: 'null' + title: Curation Status + description: Status of the curation for the Uniprot record. + + predicted_ec: + anyOf: + - items: + $ref: '#/components/schemas/ECNumberConfidence' + type: array + - type: 'null' + title: Predicted Ec + description: List of CLEAN predicted EC numbers with associated confidence scores. 
+ +ECNumberConfidence: + properties: + ec_number: + type: string + title: Ec Number + score: + type: number + title: Score + required: + - ec_number + - score +``` + +--- + +## Error Responses + +All endpoints should return standard error responses: + +### 422 Validation Error + +```json +{ + "detail": [ + { + "loc": ["query", "curation_status"], + "msg": "Invalid curation status value", + "type": "value_error" + } + ] +} +``` + +### 500 Internal Server Error + +```json +{ + "detail": "Internal server error" +} +``` + +--- + +## Performance Considerations + +### `/api/v1/typeahead` (Enhanced) + +- Search context filtering adds query complexity; ensure proper indexing on all filterable fields +- For `predicted_ec` field, ensure index on `predicted_ec.ec_number` (JSON array field) +- Consider caching common search context + field combinations +- Pagination reduces payload size for high-cardinality fields +- Add database indexes for text search (prefix matching) on typeahead fields + +### `/api/v1/search` with `curation_status` + +- Add database index on `curation_status` column if not already present +- Monitor query performance after deployment + +--- + +## Testing Requirements + +### `/api/v1/typeahead` (Enhanced) + +**Search Context:** +- Returns only values existing within constrained result set +- Multiple context params combine with AND logic +- Empty context returns matches from full database (backward compatible) + +**Pagination:** +- Default limit is 20 +- `offset` correctly skips results +- `total` reflects full match count +- `next`/`previous` URLs are correct and include all query params +- Last page has `next: null` +- First page has `previous: null` + +**New Field (`predicted_ec`):** +- Returns distinct EC numbers matching search term +- Searches within `predicted_ec[].ec_number` array +- Respects search context constraints + +**Backward Compatibility:** +- Existing calls without new params still work +- Response includes new fields without breaking 
existing clients + +### `/api/v1/curation-statuses` + +- Returns expected status options +- Response format matches schema + +### `/api/v1/search` with `curation_status` + +- Single curation_status value filters correctly +- Multiple curation_status values use OR logic +- Combines correctly with other filters (AND logic) +- Invalid curation_status value returns 422 +- Null/missing curation_status returns all records (no filter) \ No newline at end of file