From 3c839aa91566840309eeaf6db7e9da170ad8d1a7 Mon Sep 17 00:00:00 2001
From: sarthak Gangopadhyay <sarthakgy@google.com>
Date: Wed, 11 Feb 2026 10:57:40 -0500
Subject: [PATCH] Added new sample for visualizing image snippets in Vertex AI
 Managed Datasets

---
 .../create_vertex_dataset.md                  |  57 +++++++++
 .../create_vertex_dataset.py                  | 117 ++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 street_view_insights/samples/visualize_image_snippets_in_vertex/create_vertex_dataset.md
 create mode 100644 street_view_insights/samples/visualize_image_snippets_in_vertex/create_vertex_dataset.py

diff --git a/street_view_insights/samples/visualize_image_snippets_in_vertex/create_vertex_dataset.md b/street_view_insights/samples/visualize_image_snippets_in_vertex/create_vertex_dataset.md
new file mode 100644
index 0000000..1867e80
--- /dev/null
+++ b/street_view_insights/samples/visualize_image_snippets_in_vertex/create_vertex_dataset.md
@@ -0,0 +1,57 @@
+# Export Street View Insights to Vertex AI Managed Datasets
+
+This directory contains `create_vertex_dataset.py`, a utility script designed to fetch Street View image URLs (`gcs_uri`s) from a BigQuery observations table and seamlessly format them into a JSONL file that complies with Vertex AI's import requirements for image datasets. 
+
+Once generated, the JSONL file is uploaded to your specified Google Cloud Storage bucket so it can be ingested by Vertex AI Managed Datasets for training or visualization.
+
+## Prerequisites
+
+- You must have the Google Cloud SDK (`gcloud`) installed and authorized.
+- The default GCP Project must be correctly configured: `gcloud config set project PROJECT_ID`
+- The following Python packages must be installed: 
+  `pip install google-cloud-bigquery google-cloud-storage`
+
+## 1. Configure the Variables
+
+Open the `create_vertex_dataset.py` script and examine the Default Configuration variables explicitly defined at the top of the file. You may modify these directly in the code or pass them via command-line arguments.
+
+The globally defined key variables are:
+- `PROJECT_ID`: The GCP Project ID (e.g., `imagery-insights-d1xs9z`)
+- `TABLE_ID`: The fully-qualified BigQuery table from which to fetch the raw image URIs (e.g., `sarthaks-lab.imagery_insights___preview___us.latest_observations`). The table must contain a column named `gcs_uri`.
+- `DATASET_NAME`: The prefix name of the generated dataset (determines the output JSONL filename prefix).
+- `BUCKET_NAME`: The destination Google Cloud Storage bucket where the JSONL file will be uploaded (e.g., `god_level_bucket`).
+- `GCS_DESTINATION_FOLDER`: The folder *inside* the GCS bucket where the JSONL file will be stored (e.g., `misc`).
+- `LIMIT_URLS`: The maximum number of URLs to fetch from the table (useful for creating small sample datasets).
+
+## 2. Run the Script
+
+Run the script from your terminal:
+
+```bash
+# Run with default variables defined in the script file
+python3 create_vertex_dataset.py
+
+# Or, override the variables using command-line arguments
+python3 create_vertex_dataset.py \
+    --dataset_name custom_streetview_dataset \
+    --limit 500 \
+    --table_id "PROJECT_ID.imagery_insights___preview___us.latest_observations"
+```
+
+The script will query BigQuery, locally generate a JSONL file whose name ends with a random suffix of four lowercase letters to make filename collisions unlikely (e.g. `imagery_insights_sample_10_abcd.jsonl`), and upload it to your destination GCS bucket.
+
+At the end of the script's output, it will print the **GCS URI** of the uploaded file. **Copy this URI**; you will need it in the next step.
+
+## 3. Import the file in Vertex AI
+
+To visualize the resulting dataset within Vertex AI for analysis or training, follow the standard Managed Dataset import flow. For detailed documentation, see the official [Vertex AI Docs](https://cloud.google.com/vertex-ai/docs/training/using-managed-datasets).
+
+1. Navigate to the Google Cloud Console and open **Vertex AI > Datasets**.
+2. Click **+ CREATE** at the top.
+3. Name your dataset and select **Image** > **Image classification (Single-label)** (choose **Multi-label** instead if you ran the script with `--include_labels`) or whichever data type is most appropriate for your application, and select your project's region.
+4. Click **Create** to proceed to the Data Import screen.
+5. Choose **Select import files from Cloud Storage**.
+6. Paste the **GCS URI** that was copied at the end of Step 2 (e.g., `gs://god_level_bucket/misc/imagery_insights_sample_10_abcd.jsonl`) into the *Import file path* box.
+7. Click **Continue**.
+
+Vertex AI will parse the JSONL file and begin pulling the image URLs into the dataset interface. Once the ingestion job finishes, you can explore the Street View data visually within the UI.
diff --git a/street_view_insights/samples/visualize_image_snippets_in_vertex/create_vertex_dataset.py b/street_view_insights/samples/visualize_image_snippets_in_vertex/create_vertex_dataset.py
new file mode 100644
index 0000000..5f559dc
--- /dev/null
+++ b/street_view_insights/samples/visualize_image_snippets_in_vertex/create_vertex_dataset.py
@@ -0,0 +1,117 @@
+import os
+import json
+import argparse
+import random
+import string
+from google.cloud import bigquery
+from google.cloud import storage
+
+# Default configuration: each constant is the default of the matching CLI flag below.
+PROJECT_ID = "imagery-insights-d1xs9z"  # --project_id: project for the BigQuery and Storage clients
+TABLE_ID = "sarthaks-lab.imagery_insights___preview___us.latest_observations"  # --table_id: table queried for gcs_uri
+DATASET_NAME = "imagery_insights_sample"  # --dataset_name: prefix of the generated JSONL filename
+BUCKET_NAME = "god_level_bucket"  # --bucket_name: bucket the JSONL file is uploaded to
+GCS_DESTINATION_FOLDER = "misc"  # --gcs_folder: folder inside the bucket for the uploaded file
+LIMIT_URLS = 10  # --limit: maximum number of image URIs fetched; 0 or less means no LIMIT clause
+
+def export_to_vertex_jsonl(project_id, table_id, dataset_name, bucket_name, gcs_folder, output_filename, include_labels, limit):
+    """Export image URIs from BigQuery to a Vertex AI import JSONL file in GCS.
+
+    Queries `table_id` for distinct `gcs_uri` values starting with `gs://`,
+    writes one JSON record per image to the local file `output_filename`, and
+    uploads that file to the bucket. Nothing is uploaded if no record is written.
+
+    Args:
+        project_id: Project used to create the BigQuery and Storage clients.
+        table_id: Fully-qualified BigQuery table with a `gcs_uri` column.
+        dataset_name: Unused here; the caller bakes it into `output_filename`.
+        bucket_name: Destination GCS bucket.
+        gcs_folder: Folder inside the bucket; empty or None uploads to the root.
+        output_filename: Local JSONL path, also used as the object's file name.
+        include_labels: If true, add `asset_type` values as classification labels.
+        limit: Maximum number of images to fetch; None or <= 0 means no limit.
+    """
+    print(f"Querying BigQuery table: {table_id} with limit: {limit}")
+    bq_client = bigquery.Client(project=project_id)
+    limit_clause = f"LIMIT {int(limit)}" if limit and limit > 0 else ""  # int(): only a number reaches the SQL
+
+    if include_labels:
+        # Unlabeled rows are dropped in SQL so LIMIT counts only images that get written.
+        query = f"""
+            SELECT
+                gcs_uri,
+                ARRAY_AGG(DISTINCT asset_type IGNORE NULLS) as labels
+            FROM `{table_id}`
+            WHERE gcs_uri IS NOT NULL AND gcs_uri LIKE 'gs://%'
+                AND asset_type IS NOT NULL
+            GROUP BY gcs_uri
+            {limit_clause}
+        """
+        print("Including labels (multi-label image classification format)...")
+    else:
+        # Just grab unique image URIs for an unlabelled dataset
+        query = f"""
+            SELECT DISTINCT gcs_uri
+            FROM `{table_id}`
+            WHERE gcs_uri IS NOT NULL AND gcs_uri LIKE 'gs://%'
+            {limit_clause}
+        """
+        print("Extracting only image URIs (unlabeled dataset format)...")
+
+    results = bq_client.query(query).result()
+    print(f"Writing results to local file: {output_filename}")
+    count = 0
+    with open(output_filename, "w", encoding="utf-8") as f:
+        for row in results:
+            json_record = {"imageGcsUri": row.gcs_uri}
+            if include_labels:
+                # Vertex AI multi-label image classification format
+                annotations = [{"displayName": label} for label in (row.labels or []) if label]
+                if not annotations:
+                    continue  # only empty labels: nothing to annotate, skip image
+                json_record["classificationAnnotations"] = annotations
+            f.write(json.dumps(json_record) + "\n")
+            count += 1
+
+    print(f"Successfully wrote {count} records to {output_filename}")
+    if count == 0:
+        print("No records found. Exiting without uploading.")
+        return
+
+    # Strip slashes so "misc/" or "/misc" cannot yield "misc//file" object names.
+    folder = (gcs_folder or "").strip("/")
+    destination_blob_name = f"{folder}/{output_filename}" if folder else output_filename
+    print(f"Uploading {output_filename} to gs://{bucket_name}/{destination_blob_name} ...")
+
+    bucket = storage.Client(project=project_id).bucket(bucket_name)
+    bucket.blob(destination_blob_name).upload_from_filename(output_filename)
+    print("Upload complete!")
+    print(f"GCS URI: gs://{bucket_name}/{destination_blob_name}")
+    print("You can now use this URI to import data into a Managed Dataset in Vertex AI.")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Export image URLs from BigQuery to a JSONL file for Vertex AI.")
+    parser.add_argument("--project_id", type=str, default=PROJECT_ID, help="Google Cloud Project ID")
+    parser.add_argument("--table_id", type=str, default=TABLE_ID, help="BigQuery Table ID")
+    parser.add_argument("--dataset_name", type=str, default=DATASET_NAME, help="Name of the Dataset (controls output file prefix)")
+    parser.add_argument("--bucket_name", type=str, default=BUCKET_NAME, help="Destination GCS Bucket Name")
+    parser.add_argument("--gcs_folder", type=str, default=GCS_DESTINATION_FOLDER, help="Destination GCS Folder Path")
+    parser.add_argument("--limit", type=int, default=LIMIT_URLS, help="Maximum number of URLs to fetch (0 or less means no limit)")
+    parser.add_argument("--include_labels", action="store_true", help="Include asset_type as labels in Vertex AI Multi-Label Classification format.")
+
+    args = parser.parse_args()
+
+    # Output filename is <dataset>_<limit>_<4 random lowercase letters>.jsonl; the suffix makes collisions between runs unlikely.
+    random_suffix = ''.join(random.choices(string.ascii_lowercase, k=4))
+    out_file = f"{args.dataset_name}_{args.limit}_{random_suffix}.jsonl"
+
+    export_to_vertex_jsonl(
+        project_id=args.project_id,
+        table_id=args.table_id,
+        dataset_name=args.dataset_name,
+        bucket_name=args.bucket_name,
+        gcs_folder=args.gcs_folder,
+        output_filename=out_file,
+        include_labels=args.include_labels,
+        limit=args.limit,
+    )