Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 235 additions & 27 deletions app/db/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,16 @@ async def build_conditions(
conditions.append(f"amino_acids >= ${param_idx}")
query_params[param_name] = params.sequence_length

if params.curation_status is not None:
column_conditions = []
for value in params.curation_status:
param_idx += 1
param_name = f"param_{param_idx}"
column_conditions.append(f"LOWER(curation_status) = LOWER(${param_idx})")
query_params[param_name] = value
if column_conditions:
conditions.append(f"({' OR '.join(column_conditions)})")

# Combine all conditions with AND logic
where_clause = " AND ".join(conditions) if conditions else "TRUE"

Expand Down Expand Up @@ -151,41 +161,239 @@ async def get_total_count(db: Database, params: CLEANSearchQueryParams) -> int:
result = await db.fetchval(query, *query_args)
return result

def _has_search_context(params: CLEANTypeaheadQueryParams) -> bool:
    """Return True when at least one search-context filter is set on ``params``.

    List-valued filters count only when they are non-empty. Numeric filters
    count whenever they are not ``None``, so an explicit ``0`` is treated as a
    real filter; this mirrors the ``is not None`` checks used for the same
    fields in ``_build_typeahead_context_conditions``.

    Args:
        params: Typeahead query parameters carrying the optional filters.

    Returns:
        True if any context filter would contribute to the query, else False.
    """
    list_filters = (
        params.accession,
        params.organism,
        params.protein_name,
        params.gene_name,
        params.uniprot_id,
        params.clean_ec_number,
        params.curation_status,
    )
    numeric_filters = (
        params.clean_ec_confidence_min,
        params.clean_ec_confidence_max,
        params.sequence_length,
    )
    # Truthiness is right for lists (an empty list adds no SQL condition), but
    # wrong for numbers: 0 / 0.0 is falsy yet is still a filter the caller set.
    return any(list_filters) or any(value is not None for value in numeric_filters)


def _build_typeahead_context_conditions(
    params: CLEANTypeaheadQueryParams,
    start_param_idx: int = 1,
) -> Tuple[str, Dict[str, Any], int]:
    """Translate typeahead context filters into a SQL WHERE fragment.

    Columns are qualified with the ``pua`` (annotation table) and ``puace``
    (EC prediction table) aliases, so the caller's query must define them.

    Args:
        params: Typeahead query parameters carrying the optional filters.
        start_param_idx: Index of the last ``$n`` placeholder already in use.
            The first placeholder generated here is ``start_param_idx + 1``.

    Returns:
        A tuple ``(where_clause, query_params, last_idx)``. ``where_clause``
        is the AND-joined conditions, or ``"TRUE"`` when no filter applies.
        ``query_params`` maps ``param_<n>`` to the bound value in placeholder
        order. ``last_idx`` is the highest placeholder index used.
    """
    clauses = []
    bound = {}
    last_idx = start_param_idx

    def bind(value: Any) -> int:
        """Reserve the next placeholder for ``value`` and return its index."""
        nonlocal last_idx
        last_idx += 1
        bound[f"param_{last_idx}"] = value
        return last_idx

    def add_any_of(alternatives) -> None:
        """Append the alternatives as one parenthesised OR group, if any."""
        if alternatives:
            clauses.append(f"({' OR '.join(alternatives)})")

    # Text filters: exact match, OR-ed within a column, AND-ed across columns.
    text_filters = (
        ("accession", params.accession),
        ("protein_name", params.protein_name),
        ("organism", params.organism),
        ("gene_name", params.gene_name),
        ("uniprot_id", params.uniprot_id),
    )
    for column, wanted in text_filters:
        if not wanted:
            continue
        if column == "accession":
            # Only the input is upper-cased; the column is compared as stored.
            template = "pua.{col} = UPPER(${idx})"
        else:
            template = "LOWER(pua.{col}) = LOWER(${idx})"
        add_any_of(
            [template.format(col=column, idx=bind(item)) for item in wanted]
        )

    if params.clean_ec_number is not None:
        ec_alternatives = []
        for ec_number in params.clean_ec_number:
            if ec_number.endswith("-"):
                # A trailing "-" marks a partial EC number: everything from the
                # first "-" onward becomes a LIKE wildcard.
                idx = bind(re.sub(r'-.*$', '%', ec_number))
                ec_alternatives.append(f"puace.clean_ec_number LIKE ${idx}")
            else:
                idx = bind(ec_number)
                ec_alternatives.append(f"puace.clean_ec_number = ${idx}")
        add_any_of(ec_alternatives)

    # Scalar filters: one comparison each, skipped only when the value is None.
    scalar_filters = (
        ("puace.clean_ec_confidence >", params.clean_ec_confidence_min),
        ("puace.clean_ec_confidence <", params.clean_ec_confidence_max),
        ("pua.amino_acids >=", params.sequence_length),
    )
    for comparison, threshold in scalar_filters:
        if threshold is not None:
            clauses.append(f"{comparison} ${bind(threshold)}")

    if params.curation_status is not None:
        add_any_of(
            [
                f"LOWER(pua.curation_status) = LOWER(${bind(status)})"
                for status in params.curation_status
            ]
        )

    where_clause = " AND ".join(clauses) if clauses else "TRUE"
    return where_clause, bound, last_idx


async def get_typeahead_suggestions(db: Database, params: CLEANTypeaheadQueryParams
) -> Tuple[List[str], int]:
    """Get one page of typeahead suggestions plus the total match count.

    The search term is always bound as ``$1``. Without search-context filters
    the query reads a single relation (a materialized view where one is
    configured for the field). With filters it reads the main annotation
    table, joined to the EC prediction table when needed, and the bound
    context values follow the search term starting at ``$2``.

    Args:
        db: Database handle exposing awaitable ``fetch`` and ``fetchval``.
        params: Typeahead parameters: ``search``, ``field_name``, optional
            ``limit`` / ``offset`` and optional search-context filters.

    Returns:
        A tuple ``(matches, total)``. ``matches`` is the requested page of
        distinct values in ascending order; ``total`` is the number of
        distinct matching values before pagination.

    Raises:
        ValueError: If the stripped search term is shorter than 3 characters
            or ``field_name`` is not a supported field.
    """
    search = params.search.strip()
    if len(search) < 3:
        raise ValueError("Search term must be at least 3 characters long.")

    # LIMIT/OFFSET are interpolated into the SQL text rather than bound, so
    # force them to integers before they get anywhere near the query string.
    limit = int(params.limit or 20)
    offset = int(params.offset or 0)
    has_context = _has_search_context(params)

    # Per-field configuration:
    #   column              - column returned to the caller
    #   prefix_only         - True: match the start of the value; False: anywhere
    #   search_condition    - predicate against the aliased main / EC tables
    #   mv_table            - materialized view used when no context is given
    #   mv_search_condition - predicate against that materialized view
    field_config = {
        'accession': {
            'column': 'accession',
            'prefix_only': True,
            # Only the search term is upper-cased; the column is matched as stored.
            'search_condition': 'pua.accession LIKE UPPER($1)',
            'mv_table': None,
            'mv_search_condition': None,
        },
        'organism': {
            'column': 'organism',
            'prefix_only': False,
            'search_condition': 'LOWER(pua.organism) LIKE LOWER($1)',
            'mv_table': 'cleandb.predictions_uniprot_annot_mv01',
            'mv_search_condition': 'organism_lower LIKE LOWER($1)',
        },
        'protein_name': {
            'column': 'protein_name',
            'prefix_only': False,
            'search_condition': 'LOWER(pua.protein_name) LIKE LOWER($1)',
            'mv_table': 'cleandb.predictions_uniprot_annot_mv02',
            'mv_search_condition': 'protein_name_lower LIKE LOWER($1)',
        },
        'gene_name': {
            'column': 'gene_name',
            'prefix_only': False,
            'search_condition': 'LOWER(pua.gene_name) LIKE LOWER($1)',
            'mv_table': 'cleandb.predictions_uniprot_annot_mv03',
            'mv_search_condition': 'gene_name_lower LIKE LOWER($1)',
        },
        'uniprot_id': {
            'column': 'uniprot_id',
            'prefix_only': False,
            'search_condition': 'LOWER(pua.uniprot_id) LIKE LOWER($1)',
            'mv_table': None,
            'mv_search_condition': None,
        },
        'predicted_ec': {
            'column': 'clean_ec_number',
            'prefix_only': True,
            'search_condition': 'puace.clean_ec_number LIKE $1',
            'mv_table': None,
            'mv_search_condition': None,
        },
    }

    if params.field_name not in field_config:
        raise ValueError(f"Invalid field name: {params.field_name}")

    config = field_config[params.field_name]
    column = config['column']
    search_term = search + '%' if config['prefix_only'] else '%' + search + '%'
    is_ec_field = params.field_name == 'predicted_ec'
    paging = f"ORDER BY 1 ASC LIMIT {limit} OFFSET {offset}"

    if not has_context:
        # No filters: read the cheapest single relation for this field.
        if is_ec_field:
            source = "cleandb.predictions_uniprot_annot_clean_ec"
            condition = "clean_ec_number LIKE $1"
        elif config['mv_table']:
            source = config['mv_table']
            condition = config['mv_search_condition']
        else:
            source = "cleandb.predictions_uniprot_annot pua"
            condition = config['search_condition']

        count_query = f"SELECT COUNT(DISTINCT {column}) FROM {source} WHERE {condition}"
        data_query = f"SELECT DISTINCT {column} FROM {source} WHERE {condition} {paging}"
        query_args = [search_term]
    else:
        # $1 is the search term, so context placeholders start at $2
        # (start_param_idx is the last index already taken).
        context_where, context_params, _ = _build_typeahead_context_conditions(
            params, start_param_idx=1
        )

        # The EC table is needed when suggesting EC numbers or when any
        # context filter refers to a puace.* column.
        needs_ec_join = (
            is_ec_field
            or params.clean_ec_number is not None
            or params.clean_ec_confidence_min is not None
            or params.clean_ec_confidence_max is not None
        )
        from_clause = "FROM cleandb.predictions_uniprot_annot pua"
        if needs_ec_join:
            from_clause += (
                " INNER JOIN cleandb.predictions_uniprot_annot_clean_ec puace"
                " ON puace.predictions_uniprot_annot_id = pua.predictions_uniprot_annot_id"
            )
        base_query = (
            f"{from_clause} WHERE {config['search_condition']} AND {context_where}"
        )
        select_column = f"puace.{column}" if is_ec_field else f"pua.{column}"

        count_query = (
            f"SELECT COUNT(*) FROM (SELECT DISTINCT {select_column} {base_query}) sub"
        )
        data_query = f"SELECT DISTINCT {select_column} {base_query} {paging}"
        # Dict insertion order matches placeholder order ($2, $3, ...).
        query_args = [search_term, *context_params.values()]

    total = await db.fetchval(count_query, *query_args)
    records = await db.fetch(data_query, *query_args)
    return [record[column] for record in records], total

async def get_ec_suggestions(db: Database, params: CLEANECLookupQueryParams
) -> List[Dict[str, str]]:
Expand Down
46 changes: 45 additions & 1 deletion app/models/clean_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class CLEANSearchResponse(BaseModel):

class CLEANTypeaheadResponse(BaseModel):
"""Model for the response of a CLEAN typeahead query."""
field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id'] = Field(
field_name: Literal['accession', 'organism', 'protein_name', 'gene_name', 'uniprot_id', 'predicted_ec'] = Field(
'organism',
description="Which field to search in",
),
Expand All @@ -125,6 +125,30 @@ class CLEANTypeaheadResponse(BaseModel):
[],
description="List of results matching the search term."
)
search_context: Optional[dict] = Field(
None,
description="The search context filters that were applied to the typeahead query."
)
total: int = Field(
0,
description="Total number of matching results (before pagination)."
)
limit: int = Field(
20,
description="Maximum number of results returned."
)
offset: int = Field(
0,
description="Number of results skipped."
)
next: Optional[str] = Field(
None,
description="Link to the next page of results."
)
previous: Optional[str] = Field(
None,
description="Link to the previous page of results."
)

class CLEANECLookupMatch(BaseModel):
"""Model for a single match in the CLEAN EC lookup response."""
Expand All @@ -145,4 +169,24 @@ class CLEANECLookupResponse(BaseModel):
matches: List[CLEANECLookupMatch] = Field(
[],
description="List of matches for the EC lookup."
)


class CurationStatusOption(BaseModel):
    """One curation status choice: a raw value paired with a display label."""

    # Both fields are required (the default is Ellipsis).
    value: str = Field(
        default=...,
        description="The value of the curation status (e.g., 'reviewed', 'unreviewed').",
    )
    label: str = Field(
        default=...,
        description="The human-readable label for the curation status.",
    )


class CLEANCurationStatusResponse(BaseModel):
    """Response body returned by the curation statuses endpoint."""

    # Defaults to an empty list when no statuses are supplied.
    statuses: List[CurationStatusOption] = Field(
        default=[],
        description="List of available curation status options.",
    )
Loading