 
 import logging
 import re
+import string
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urljoin
 API_CONDA_BASE_URL = "https://npe2api.vercel.app/api/conda/"
 API_PYPI_BASE_URL = "https://npe2api.vercel.app/api/pypi/"
 API_MANIFEST_BASE_URL = "https://npe2api.vercel.app/api/manifest/"
+HOME_PYPI_REGEX = r"(.*)(pypi.org)(/)(project)(/)(.*)"
+HOME_GITHUB_REGEX = r"(http(s)?)(:(//)?)(.*)(github.com)(/)?(.+)(/)(.+)(\.git)?$"
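+# Illustrative matches (not exhaustive): HOME_PYPI_REGEX is intended for URLs
+# like "https://pypi.org/project/<name>", and HOME_GITHUB_REGEX for repository
+# URLs like "https://github.com/<org>/<repo>" (optionally ending in ".git").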
 
 # Define columns needed for the plugin html page
 PLUGIN_PAGE_COLUMNS = [
3134 "author" ,
3235 "package_metadata_author_email" ,
3336 "license" ,
34- "home" ,
37+ "home_github" ,
38+ "home_pypi" ,
39+ "home_other" ,
3540 "package_metadata_home_page" ,
3641 "summary" ,
3742 "package_metadata_requires_python" ,
@@ -114,6 +119,57 @@ def classify_url(url: str) -> str:
     return "other"
 
 
+def expand_proj_url(plugin_data: dict) -> None:
+    """
+    Expand the project URLs in the plugin data dictionary.
+
+    This function checks whether the 'project_url' key exists in
+    the plugin data and extracts homepage and GitHub repository URLs,
+    if present.
+
+    Parameters
+    ----------
+    plugin_data : dict
+        The dictionary containing plugin data.
+
+    Returns
+    -------
+    None
+        The function modifies the `plugin_data` dictionary in place.
+    """
+    urls = plugin_data.get("project_url", [])
+
+    # If no project URLs exist, fall back to the 'home_page' key
+    # (used by the older metadata spec).
+    if not urls and "home_page" in plugin_data:
+        urls = [f"homepage, {plugin_data['home_page']}"]
+
+    plugin_data["home_github"] = ""
+    plugin_data["home_other"] = ""
+    for url_info in urls:
+        label, url = url_info.split(", ", 1)
+        normalized_label = normalize_label(label)
+        plugin_data[normalized_label] = url
+        if re.match(HOME_GITHUB_REGEX, url):
+            # If the url matches the GitHub repository link structure,
+            # we display the GitHub icon; otherwise, the homepage label
+            # is shown as some other url.
+            plugin_data["home_github"] = url
+        elif normalized_label == "homepage":
+            plugin_data["home_other"] = url
+    plugin_data.pop("project_url", None)
+
+
+def normalize_label(label: str) -> str:
+    """Normalize project URL label.
+
+    Code reproduced from:
+    https://packaging.python.org/en/latest/specifications/well-known-project-urls/#label-normalization
+    """
+    chars_to_remove = string.punctuation + string.whitespace
+    removal_map = str.maketrans("", "", chars_to_remove)
+    return label.translate(removal_map).lower()
+
+
 def flatten_and_merge(original, additional, parent_key="") -> None:
     """
     Recursively flattens a nested dictionary or list of dictionaries and merges the result into the original dictionary.
@@ -258,13 +314,13 @@ def process_plugin(plugin):
     # need pypi info to get the initial and latest release date
     pypi_info = fetch(urljoin(API_PYPI_BASE_URL, plugin_normalized_name))
 
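+    # Derive home_github/home_other (and normalized project-URL keys) from the
+    # plugin metadata before the conda and PyPI info is merged in below.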
+    expand_proj_url(plugin_data)
+
     if conda_info:
         # we only want a limited set of conda info
         conda_info = {
             "conda_name": conda_info["name"],
             "conda_html_url": conda_info["html_url"],
-            # TODO: this should come from project_url not conda info
-            "home": conda_info["home"],
         }
         plugin_data.update(conda_info)
 
@@ -278,10 +334,14 @@ def process_plugin(plugin):
     last_updated_date = get_version_release_date(
         pypi_info, plugin_latest_release
     )
+
+    # grab pypi project link
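+    # (PyPI's JSON metadata exposes the project page under info.package_url;
+    # fall back to an empty string if it is missing.)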
+    home_pypi = pypi_info["info"].get("package_url", "")
     plugin_data.update(
         {
             "created_at": initial_release_date,
             "modified_at": last_updated_date,
+            "home_pypi": home_pypi,
         }
     )
 
@@ -335,23 +395,7 @@ def process_plugin(plugin):
         df_plugins["modified_at"], format="mixed"
     ).dt.date
 
-    # Set a temporary helper column 'home_type' by classifying the 'home' URL to a common package repository name, like 'pypi', 'github', or 'other'
-    df_plugins["home_type"] = df_plugins["home"].apply(classify_url)
-    # Using the 'home_type' column, create new colums for 'pypi', 'github', and 'other'
-    df_plugins["home_pypi"] = df_plugins["home"].where(
-        df_plugins["home_type"] == "pypi", ""
-    )
-    df_plugins["home_github"] = df_plugins["home"].where(
-        df_plugins["home_type"] == "github", ""
-    )
-    df_plugins["home_other"] = df_plugins["home"].where(
-        df_plugins["home_type"] == "other", ""
-    )
-
-    # Delete the temporary 'home_type' column as it is no longer needed
-    df_plugins.drop("home_type", axis=1, inplace=True)
-
-    # Perform row-wise cleaning of the DataFrame for author, license, and home_pypi fields
+    # Perform row-wise cleaning of the DataFrame for author and license fields
     for index, row in df_plugins.iterrows():
         # Check if 'author' is NaN or contains quotation marks
         if pd.isna(row["author"]) or '"' in str(row["author"]):
@@ -371,11 +415,5 @@ def process_plugin(plugin):
         elif '"' in str(row["license"]):
             df_plugins.at[index, "license"] = f"{row['license'][:30]}..."
 
-        # Fill home_pypi
-        if not row["home_pypi"]:
-            df_plugins.at[index, "home_pypi"] = (
-                f"https://pypi.org/project/{row['name']}"
-            )
-
     # Save the final DataFrame of plugin page information to a CSV file
     df_plugins.to_csv(f"{data_dir}/final_plugins.csv")