Skip to content

Commit d02cd89

Browse files
authored
Parse pypi and github URLs correctly (#84)
Prior to this PR, our retrieval of GitHub URLs was missing many plugins because it did not inspect the `project_url` field. This PR updates the logic for fetching this information to check both the `home_page` and `project_url` fields. It also retrieves the PyPI URL from the PyPI API information, as this is likely to be more stable. In future we should also add a link to the conda URL information (if present), and potentially parse documentation/bugtracker/etc. links for display. However, that would require more icons/CSS/layout changes, and I'd rather get #75 and #81 done first. I thought this was valuable enough to get it in as soon as possible.
2 parents 3d1d9a5 + a4e5cbc commit d02cd89

File tree

1 file changed

+64
-26
lines changed

1 file changed

+64
-26
lines changed

fetch_napari_data.py

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import logging
99
import re
10+
import string
1011
import sys
1112
from concurrent.futures import ThreadPoolExecutor
1213
from urllib.parse import urljoin
@@ -19,6 +20,8 @@
1920
API_CONDA_BASE_URL = "https://npe2api.vercel.app/api/conda/"
2021
API_PYPI_BASE_URL = "https://npe2api.vercel.app/api/pypi/"
2122
API_MANIFEST_BASE_URL = "https://npe2api.vercel.app/api/manifest/"
23+
# Pattern for PyPI project pages (".../pypi.org/project/<name>...").
# The dot in the host name is escaped so that it only matches a literal ".".
HOME_PYPI_REGEX = r"(.*)(pypi\.org)(/)(project)(/)(.*)"
# Pattern for GitHub repository links ("http(s)://...github.com/<owner>/<repo>",
# optionally ending in ".git"). The dot in the host name is escaped so that
# look-alike hosts such as "githubXcom" are not accepted.
HOME_GITHUB_REGEX = r"(http(s)?)(:(//)?)(.*)(github\.com)(/)?(.+)(/)(.+)(\.git)?$"
2225

2326
# Define columns needed for the plugin html page
2427
PLUGIN_PAGE_COLUMNS = [
@@ -31,7 +34,9 @@
3134
"author",
3235
"package_metadata_author_email",
3336
"license",
34-
"home",
37+
"home_github",
38+
"home_pypi",
39+
"home_other",
3540
"package_metadata_home_page",
3641
"summary",
3742
"package_metadata_requires_python",
@@ -114,6 +119,57 @@ def classify_url(url: str) -> str:
114119
return "other"
115120

116121

122+
def expand_proj_url(plugin_data: dict) -> None:
    """
    Expands the project URL in the plugin data dictionary.

    Each entry of the 'project_url' value is expected to be a string of
    the form ``"<label>, <url>"``. For every well-formed entry the URL is
    stored under the normalized label (see `normalize_label`). In addition,
    'home_github' is set to a URL that matches `HOME_GITHUB_REGEX`, and
    'home_other' is set to a non-GitHub URL whose normalized label is
    'homepage'. Both keys are always present afterwards and default to
    an empty string. When several entries qualify, the last one wins.

    If 'project_url' is missing or empty, a non-empty 'home_page' value
    (old metadata spec) is used as the homepage instead.

    Parameters
    ----------
    plugin_data : dict
        The dictionary containing plugin data.

    Returns
    -------
    None
        The function modifies the `plugin_data` dictionary in place and
        removes the 'project_url' key if it is present.
    """
    # `or []` also covers an explicit None value for 'project_url'.
    urls = plugin_data.get("project_url") or []
    if isinstance(urls, str):
        # A bare string would otherwise be iterated character by character.
        urls = [urls]

    # If urls do not exist, we try using the 'home_page' key (like in the old metadata spec).
    if not urls and plugin_data.get("home_page"):
        urls = [f"homepage, {plugin_data['home_page']}"]

    plugin_data["home_github"] = ""
    plugin_data["home_other"] = ""
    for url_info in urls:
        # Split on the first comma only, so a comma inside the URL is kept.
        raw_label, separator, raw_url = url_info.partition(",")
        label = normalize_label(raw_label)
        url = raw_url.strip()
        if not separator or not label or not url:
            # Malformed entry (no separator, empty label or empty URL): skip it.
            continue

        # NOTE(review): a label that normalizes to an existing metadata key
        # (e.g. "name") overwrites that key - confirm this is acceptable.
        plugin_data[label] = url
        if re.match(HOME_GITHUB_REGEX, url):
            # If url matches github repository link structure,
            # we display the github icon.
            plugin_data["home_github"] = url
        elif label == "homepage":
            # Otherwise, display the homepage label as some other url.
            # The normalized label is compared so that "Homepage" and
            # "Home Page" are recognized as well.
            plugin_data["home_other"] = url

    # The key is absent when the 'home_page' fallback was used.
    plugin_data.pop("project_url", None)
160+
161+
162+
def normalize_label(label: str) -> str:
    """Return the normalized form of a project URL label.

    All ASCII punctuation and whitespace characters are dropped and the
    remaining characters are lower-cased.

    Code reproduced from:
    https://packaging.python.org/en/latest/specifications/well-known-project-urls/#label-normalization
    """
    kept_chars = [
        char
        for char in label
        if char not in string.punctuation and char not in string.whitespace
    ]
    return "".join(kept_chars).lower()
171+
172+
117173
def flatten_and_merge(original, additional, parent_key="") -> None:
118174
"""
119175
Recursively flattens a nested dictionary or list of dictionaries and merges the result into the original dictionary.
@@ -258,13 +314,13 @@ def process_plugin(plugin):
258314
# need pypi info to get the initial and latest release date
259315
pypi_info = fetch(urljoin(API_PYPI_BASE_URL, plugin_normalized_name))
260316

317+
expand_proj_url(plugin_data)
318+
261319
if conda_info:
262320
# we only want a limited set of conda info
263321
conda_info = {
264322
"conda_name": conda_info["name"],
265323
"conda_html_url": conda_info["html_url"],
266-
# TODO: this should come from project_url not conda info
267-
"home": conda_info["home"],
268324
}
269325
plugin_data.update(conda_info)
270326

@@ -278,10 +334,14 @@ def process_plugin(plugin):
278334
last_updated_date = get_version_release_date(
279335
pypi_info, plugin_latest_release
280336
)
337+
338+
# grab pypi project link
339+
home_pypi = pypi_info["info"].get("package_url", "")
281340
plugin_data.update(
282341
{
283342
"created_at": initial_release_date,
284343
"modified_at": last_updated_date,
344+
"home_pypi": home_pypi,
285345
}
286346
)
287347

@@ -335,23 +395,7 @@ def process_plugin(plugin):
335395
df_plugins["modified_at"], format="mixed"
336396
).dt.date
337397

338-
# Set a temporary helper column 'home_type' by classifying the 'home' URL to a common package repository name, like 'pypi', 'github', or 'other'
339-
df_plugins["home_type"] = df_plugins["home"].apply(classify_url)
340-
# Using the 'home_type' column, create new colums for 'pypi', 'github', and 'other'
341-
df_plugins["home_pypi"] = df_plugins["home"].where(
342-
df_plugins["home_type"] == "pypi", ""
343-
)
344-
df_plugins["home_github"] = df_plugins["home"].where(
345-
df_plugins["home_type"] == "github", ""
346-
)
347-
df_plugins["home_other"] = df_plugins["home"].where(
348-
df_plugins["home_type"] == "other", ""
349-
)
350-
351-
# Delete the temporary 'home_type' column as it is no longer needed
352-
df_plugins.drop("home_type", axis=1, inplace=True)
353-
354-
# Perform row-wise cleaning of the DataFrame for author, license, and home_pypi fields
398+
# Perform row-wise cleaning of the DataFrame for author and license fields
355399
for index, row in df_plugins.iterrows():
356400
# Check if 'author' is NaN or contains quotation marks
357401
if pd.isna(row["author"]) or '"' in str(row["author"]):
@@ -371,11 +415,5 @@ def process_plugin(plugin):
371415
elif '"' in str(row["license"]):
372416
df_plugins.at[index, "license"] = f"{row['license'][:30]}..."
373417

374-
# Fill home_pypi
375-
if not row["home_pypi"]:
376-
df_plugins.at[index, "home_pypi"] = (
377-
f"https://pypi.org/project/{row['name']}"
378-
)
379-
380418
# Save the final DataFrame of plugin page information to a CSV file
381419
df_plugins.to_csv(f"{data_dir}/final_plugins.csv")

0 commit comments

Comments
 (0)