moleculemaker · ckouder · Feb 24, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/app/db/queries.py b/app/db/queries.py
@@ -78,6 +78,16 @@ async def build_conditions(
         conditions.append(f"amino_acids >= ${param_idx}")
         query_params[param_name] = params.sequence_length
 
+    if params.curation_status is not None:
+        column_conditions = []
+        for value in params.curation_status:
+            param_idx += 1
+            param_name = f"param_{param_idx}"
+            column_conditions.append(f"LOWER(curation_status) = LOWER(${param_idx})")
+            query_params[param_name] = value
+        if column_conditions:
+            conditions.append(f"({' OR '.join(column_conditions)})")
+
     # Combine all conditions with AND logic
     where_clause = " AND ".join(conditions) if conditions else "TRUE"
 
@@ -151,41 +161,239 @@ async def get_total_count(db: Database, params: CLEANSearchQueryParams) -> int:
     result = await db.fetchval(query, *query_args)
     return result
 
+def _has_search_context(params: CLEANTypeaheadQueryParams) -> bool:
+    """Return True when at least one search-context filter is set.
+
+    List filters count only when non-empty; the scalar filters are compared
+    with ``is not None`` so that a legitimate ``0`` / ``0.0`` bound still
+    counts as context, matching _build_typeahead_context_conditions.
+    """
+    list_filters = (params.accession, params.organism, params.protein_name,
+                    params.gene_name, params.uniprot_id,
+                    params.clean_ec_number, params.curation_status)
+    scalar_filters = (params.clean_ec_confidence_min,
+                      params.clean_ec_confidence_max, params.sequence_length)
+    return (any(bool(values) for values in list_filters)
+            or any(value is not None for value in scalar_filters))
+
+
+def _build_typeahead_context_conditions(
+    params: CLEANTypeaheadQueryParams,
+    start_param_idx: int = 1,
+) -> Tuple[str, Dict[str, Any], int]:
+    """Translate the typeahead context filters into a SQL WHERE fragment.
+
+    The placeholder counter is incremented before every use, so the first
+    value is bound to ``$<start_param_idx + 1>``; with the default start
+    index that leaves ``$1`` free for the caller.
+
+    Returns:
+        ``(where_clause, query_params, last_param_idx)``: the active filter
+        groups joined with AND (``"TRUE"`` when there are none), a dict
+        mapping ``param_<n>`` to the value bound to ``$<n>`` in placeholder
+        order, and the highest placeholder number handed out.
+    """
+    clauses = []
+    bound = {}
+    idx = start_param_idx
+
+    def bind(value: Any) -> int:
+        """Record *value* under the next placeholder number and return it."""
+        nonlocal idx
+        idx += 1
+        bound[f"param_{idx}"] = value
+        return idx
+
+    def any_of(alternatives) -> None:
+        """Append *alternatives* as one parenthesised OR group, if any."""
+        if alternatives:
+            clauses.append(f"({' OR '.join(alternatives)})")
+
+    # Case-insensitive exact-match text filters; values for one column are ORed.
+    text_filters = (
+        ("accession", params.accession),
+        ("protein_name", params.protein_name),
+        ("organism", params.organism),
+        ("gene_name", params.gene_name),
+        ("uniprot_id", params.uniprot_id),
+    )
+    for column, values in text_filters:
+        if not values:
+            continue
+        if column == "accession":
+            # The accession column is compared as-is against the upper-cased input.
+            any_of([f"pua.{column} = UPPER(${bind(v)})" for v in values])
+        else:
+            any_of([f"LOWER(pua.{column}) = LOWER(${bind(v)})" for v in values])
+
+    if params.clean_ec_number is not None:
+        ec_alternatives = []
+        for ec in params.clean_ec_number:
+            if ec.endswith("-"):
+                # Trailing dash: everything from the first dash onwards
+                # becomes a LIKE wildcard (e.g. "1.1.-.-" -> "1.1.%").
+                pattern_idx = bind(re.sub(r'-.*$', '%', ec))
+                ec_alternatives.append(f"puace.clean_ec_number LIKE ${pattern_idx}")
+            else:
+                ec_alternatives.append(f"puace.clean_ec_number = ${bind(ec)}")
+        any_of(ec_alternatives)
+
+    # Scalar bounds: confidence limits are exclusive, sequence length inclusive.
+    scalar_filters = (
+        ("puace.clean_ec_confidence >", params.clean_ec_confidence_min),
+        ("puace.clean_ec_confidence <", params.clean_ec_confidence_max),
+        ("pua.amino_acids >=", params.sequence_length),
+    )
+    for comparison, limit_value in scalar_filters:
+        if limit_value is not None:
+            clauses.append(f"{comparison} ${bind(limit_value)}")
+
+    if params.curation_status is not None:
+        statuses = params.curation_status
+        any_of([f"LOWER(pua.curation_status) = LOWER(${bind(s)})" for s in statuses])
+
+    return " AND ".join(clauses) or "TRUE", bound, idx
+
+
 async def get_typeahead_suggestions(db: Database, params: CLEANTypeaheadQueryParams
-) -> List[str]:
-    """Get typeahead suggestions based on the query parameters."""
+) -> Tuple[List[str], int]:
+    """Get typeahead suggestions based on the query parameters.
+
+    Returns a tuple of (matches, total_count), where total_count is the number
+    of distinct matches before LIMIT/OFFSET are applied.
+
+    Raises ValueError for a search term shorter than 3 characters (after
+    stripping) or an unknown field name; limit/offset are coerced with int().
+    """
     search = params.search.strip()
     if len(search) < 3:
         raise ValueError("Search term must be at least 3 characters long.")
 
-    if params.field_name == 'accession':
-        # match the beginning of the string
-        # accessions are stored and indexed in uppercase
-        search += '%'
-        query = f"""SELECT DISTINCT accession FROM cleandb.predictions_uniprot_annot WHERE accession LIKE UPPER($1) ORDER BY 1 ASC"""
-    elif params.field_name == 'organism':
-        search = '%' + search + '%'
-        # match any part of the string
-        query = f"""SELECT DISTINCT organism FROM cleandb.predictions_uniprot_annot_mv01 WHERE organism_lower LIKE LOWER($1) ORDER BY 1 ASC"""
-    elif params.field_name == 'protein_name':
-        # match any part of the string
-        search = '%' + search + '%'
-        query = f"""SELECT DISTINCT protein_name FROM cleandb.predictions_uniprot_annot_mv02 WHERE protein_name_lower LIKE LOWER($1) ORDER BY 1 ASC"""
-    elif params.field_name == 'gene_name':
-        # match any part of the string (note we have gene names that start with an apostrophe, for example, which the user might not expect)
-        search = '%' + search + '%'
-        query = f"""SELECT DISTINCT gene_name FROM cleandb.predictions_uniprot_annot_mv03 WHERE gene_name_lower LIKE LOWER($1) ORDER BY 1 ASC"""
-    elif params.field_name == 'uniprot_id':
-        search = '%' + search + '%'
-        query = f"""SELECT DISTINCT uniprot_id FROM cleandb.predictions_uniprot_annot WHERE LOWER(uniprot_id) LIKE LOWER($1) ORDER BY 1 ASC"""
-    else:
+    limit = int(params.limit or 20)  # int(): only integers may be interpolated into the SQL below
+    offset = int(params.offset or 0)
+    has_context = _has_search_context(params)
+
+    # Field-specific configuration
+    field_config = {
+        'accession': {
+            'search_pattern': lambda s: s + '%',  # match beginning
+            'search_condition': 'pua.accession LIKE UPPER($1)',
+            'mv_table': None,
+            'mv_search_condition': None,
+            'column': 'accession',
+            'result_column': 'accession',
+        },
+        'organism': {
+            'search_pattern': lambda s: '%' + s + '%',  # match anywhere
+            'search_condition': 'LOWER(pua.organism) LIKE LOWER($1)',
+            'mv_table': 'cleandb.predictions_uniprot_annot_mv01',
+            'mv_search_condition': 'organism_lower LIKE LOWER($1)',
+            'column': 'organism',
+            'result_column': 'organism',
+        },
+        'protein_name': {
+            'search_pattern': lambda s: '%' + s + '%',
+            'search_condition': 'LOWER(pua.protein_name) LIKE LOWER($1)',
+            'mv_table': 'cleandb.predictions_uniprot_annot_mv02',
+            'mv_search_condition': 'protein_name_lower LIKE LOWER($1)',
+            'column': 'protein_name',
+            'result_column': 'protein_name',
+        },
+        'gene_name': {
+            'search_pattern': lambda s: '%' + s + '%',
+            'search_condition': 'LOWER(pua.gene_name) LIKE LOWER($1)',
+            'mv_table': 'cleandb.predictions_uniprot_annot_mv03',
+            'mv_search_condition': 'gene_name_lower LIKE LOWER($1)',
+            'column': 'gene_name',
+            'result_column': 'gene_name',
+        },
+        'uniprot_id': {
+            'search_pattern': lambda s: '%' + s + '%',
+            'search_condition': 'LOWER(pua.uniprot_id) LIKE LOWER($1)',
+            'mv_table': None,
+            'mv_search_condition': None,
+            'column': 'uniprot_id',
+            'result_column': 'uniprot_id',
+        },
+        'predicted_ec': {
+            'search_pattern': lambda s: s + '%',  # match beginning of EC number
+            'search_condition': 'puace.clean_ec_number LIKE $1',
+            'mv_table': None,
+            'mv_search_condition': None,
+            'column': 'clean_ec_number',
+            'result_column': 'clean_ec_number',
+        },
+    }
+
+    if params.field_name not in field_config:
         raise ValueError(f"Invalid field name: {params.field_name}")
 
-    query += f" LIMIT {params.limit or 10}"
+    config = field_config[params.field_name]
+    search_term = config['search_pattern'](search)
+
+    if not has_context:
+        # No search context - use materialized views for better performance when available
+        if params.field_name == 'predicted_ec':
+            # Query the EC table directly
+            count_query = f"""SELECT COUNT(DISTINCT clean_ec_number) FROM cleandb.predictions_uniprot_annot_clean_ec WHERE clean_ec_number LIKE $1"""
+            data_query = f"""SELECT DISTINCT clean_ec_number FROM cleandb.predictions_uniprot_annot_clean_ec WHERE clean_ec_number LIKE $1 ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}"""
+        elif config['mv_table']:
+            # Use materialized view
+            count_query = f"""SELECT COUNT(DISTINCT {config['column']}) FROM {config['mv_table']} WHERE {config['mv_search_condition']}"""
+            data_query = f"""SELECT DISTINCT {config['column']} FROM {config['mv_table']} WHERE {config['mv_search_condition']} ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}"""
+        else:
+            # Query main table directly
+            count_query = f"""SELECT COUNT(DISTINCT {config['column']}) FROM cleandb.predictions_uniprot_annot pua WHERE {config['search_condition']}"""
+            data_query = f"""SELECT DISTINCT {config['column']} FROM cleandb.predictions_uniprot_annot pua WHERE {config['search_condition']} ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}"""
+
+        total = await db.fetchval(count_query, search_term)
+        records = await db.fetch(data_query, search_term)
+        return [record[config['result_column']] for record in records], total
 
-    # Execute the query
-    records = await db.fetch(query, search)
-    return [record[params.field_name] for record in records]
+    else:
+        # Has search context - need to join with main table and apply filters
+        # The search term is $1; the helper pre-increments, so context params start at $2.
+        context_where, context_params, _ = _build_typeahead_context_conditions(params, start_param_idx=1)
+
+        if params.field_name == 'predicted_ec':
+            # Need to join with EC table
+            base_query = f"""
+                FROM cleandb.predictions_uniprot_annot pua
+                INNER JOIN cleandb.predictions_uniprot_annot_clean_ec puace
+                    ON puace.predictions_uniprot_annot_id = pua.predictions_uniprot_annot_id
+                WHERE puace.clean_ec_number LIKE $1
+                    AND {context_where}
+            """
+            select_column = "puace.clean_ec_number"
+        else:
+            # Check if we need to join with EC table for context filtering
+            needs_ec_join = params.clean_ec_number is not None or params.clean_ec_confidence_min is not None or params.clean_ec_confidence_max is not None
+
+            if needs_ec_join:
+                base_query = f"""
+                    FROM cleandb.predictions_uniprot_annot pua
+                    INNER JOIN cleandb.predictions_uniprot_annot_clean_ec puace
+                        ON puace.predictions_uniprot_annot_id = pua.predictions_uniprot_annot_id
+                    WHERE {config['search_condition']}
+                        AND {context_where}
+                """
+            else:
+                base_query = f"""
+                    FROM cleandb.predictions_uniprot_annot pua
+                    WHERE {config['search_condition']}
+                        AND {context_where}
+                """
+            select_column = f"pua.{config['column']}"
+
+        count_query = f"SELECT COUNT(*) FROM (SELECT DISTINCT {select_column} {base_query}) sub"
+        data_query = f"SELECT DISTINCT {select_column} {base_query} ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}"
+
+        # Build query args: search_term first, then context params
+        query_args = [search_term] + list(context_params.values())
+
+        total = await db.fetchval(count_query, *query_args)
+        records = await db.fetch(data_query, *query_args)
+        return [record[config['result_column']] for record in records], total
 
 async def get_ec_suggestions(db: Database, params: CLEANECLookupQueryParams
 ) -> List[Dict[str, str]]:

diff --git a/app/models/clean_data.py b/app/models/clean_data.py
@@ -112,7 +112,7 @@ class CLEANSearchResponse(BaseModel):
 
 class CLEANTypeaheadResponse(BaseModel):
     """Model for the response of a CLEAN typeahead query."""
-    field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id'] = Field(
+    field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id', 'predicted_ec'] = Field(
         'organism',
         description="Which field to search in",
     ),
@@ -125,6 +125,30 @@ class CLEANTypeaheadResponse(BaseModel):
         [],
         description="List of results matching the search term."
     )
+    search_context: Optional[dict] = Field(
+        None,
+        description="The search context filters that were applied to the typeahead query."
+    )
+    total: int = Field(
+        0,
+        description="Total number of matching results (before pagination)."
+    )
+    limit: int = Field(
+        20,
+        description="Maximum number of results returned."
+    )
+    offset: int = Field(
+        0,
+        description="Number of results skipped."
+    )
+    next: Optional[str] = Field(
+        None,
+        description="Link to the next page of results."
+    )
+    previous: Optional[str] = Field(
+        None,
+        description="Link to the previous page of results."
+    )
 
 class CLEANECLookupMatch(BaseModel):
     """Model for a single match in the CLEAN EC lookup response."""
@@ -145,4 +169,24 @@ class CLEANECLookupResponse(BaseModel):
     matches: List[CLEANECLookupMatch] = Field(
         [],
         description="List of matches for the EC lookup."
+    )
+
+
+class CurationStatusOption(BaseModel):
+    """Model for a curation status option."""
+    # Required: the status value itself.
+    value: str = Field(
+        default=...,
+        description="The value of the curation status (e.g., 'reviewed', 'unreviewed').")
+    # Required: the label displayed for that value.
+    label: str = Field(
+        default=...,
+        description="The human-readable label for the curation status.")
+
+
+class CLEANCurationStatusResponse(BaseModel):
+    """Model for the response of the curation statuses endpoint."""
+    # Available status options; the default is an empty list.
+    statuses: List[CurationStatusOption] = Field(
+        default=[], description="List of available curation status options."
     )