CERNDocumentServer · TahaKhan998 · Apr 20, 2026 · kpsherva · Apr 22, 2026 · TahaKhan998
diff --git a/invenio.cfg b/invenio.cfg
@@ -19,7 +19,8 @@ from cds_rdm.permissions import (
     CDSRequestsPermissionPolicy,
     CDSRDMPreservationSyncPermissionPolicy,
     lock_edit_record_published_files,
-    CDSAuditLogPermissionPolicy
+    CDSAuditLogPermissionPolicy,
+    CDSJobLogsPermissionPolicy,
 )
 from cds_rdm.files import storage_factory
 from cds_rdm.inspire_harvester.reader import InspireHTTPReader
@@ -774,3 +775,4 @@ AUDIT_LOGS_ENABLED = True
 RDM_ALLOW_OWNERS_REMOVE_COMMUNITY_FROM_RECORD = False
 
 AUDIT_LOGS_PERMISSION_POLICY = CDSAuditLogPermissionPolicy
+APP_LOGS_PERMISSION_POLICY = CDSJobLogsPermissionPolicy
diff --git a/site/cds_rdm/administration/harvester_reports.py b/site/cds_rdm/administration/harvester_reports.py
@@ -125,5 +125,8 @@ def init_search_config(self, **kwargs):
             headers=self.get_search_request_headers(**kwargs),
             pagination_options=(20, 50),
             default_size=20,
-            hidden_params=[["action", "record.publish"]],
-        )
+            hidden_params=[
+                ["action", "record.publish"],
+                ["user_id", "system"],
+            ],
+        )
diff --git a/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/DownloadButton.js b/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/DownloadButton.js
@@ -8,25 +8,29 @@ import React from "react";
 import { withState } from "react-searchkit";
 import { Button, Icon } from "semantic-ui-react";
 import { i18next } from "@translations/invenio_administration/i18next";
+import { extractRunIdFromQuery } from "./utils";
 
 const DownloadButtonComponent = ({ currentQueryState }) => {
-  const handleDownload = () => {
-    const query = currentQueryState.queryString || "";
-    const hiddenParams = currentQueryState.hiddenParams || [];
+  const domContainer = document.getElementById("invenio-search-config");
+  const runs = JSON.parse(domContainer?.dataset.harvesterRuns || "[]");
 
-    const params = new URLSearchParams();
-    if (query) params.set("q", query);
-    hiddenParams.forEach(([key, value]) => params.append(key, value));
+  const runId = extractRunIdFromQuery(
+    currentQueryState.queryString || "",
+    runs
+  );
 
-    const downloadUrl = `/harvester-reports/download?${params.toString()}`;
-    window.location.href = downloadUrl;
+  const handleDownload = () => {
+    if (!runId) return;
+    const params = new URLSearchParams({ run_id: runId });
+    window.location.href = `/harvester-reports/download?${params.toString()}`;
   };
 
   return (
     <Button
       icon
       labelPosition="left"
       onClick={handleDownload}
+      disabled={!runId}
       className="harvester-download-button"
       size="small"
     >

diff --git a/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js b/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js
@@ -16,6 +16,11 @@ import { DownloadButton } from "./DownloadButton";
  * Custom SearchBar component with run selector
  */
 const SearchBarComponent = ({ updateQueryState, currentQueryState }) => {
+  const hiddenParams = [
+    ["action", "record.publish"],
+    ["user_id", "system"],
+  ];
+
   // Get runs from data attributes
   const domContainer = document.getElementById("invenio-search-config");
   const runs = JSON.parse(domContainer?.dataset.harvesterRuns || "[]");
@@ -48,7 +53,7 @@ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => {
     updateQueryState({
       ...currentQueryState,
       queryString,
-      hiddenParams: [["action", "record.publish"]],
+      hiddenParams,
     });
   };
 
@@ -61,7 +66,7 @@ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => {
     updateQueryState({
       ...currentQueryState,
       queryString: inputValue,
-      hiddenParams: [["action", "record.publish"]],
+      hiddenParams,
     });
   };
 

diff --git a/site/cds_rdm/generators.py b/site/cds_rdm/generators.py
@@ -117,10 +117,15 @@ def needs(self, **kwargs):
         return [harvester_admin_access_action]
 
     def query_filter(self, identity=None, **kwargs):
-        """Restrict harvester curators to system user audit logs only."""
-        for need in identity.provides:
-            if need == harvester_admin_access_action:
-                return dsl.Q("term", **{"user.id": "system"})
+        """Restrict harvester curators to system-user ``record.publish`` audit logs."""
+        if identity and Permission(harvester_admin_access_action).allows(identity):
+            return dsl.Q(
+                "bool",
+                must=[
+                    dsl.Q("term", **{"user.id": "system"}),
+                    dsl.Q("term", action="record.publish"),
+                ],
+            )
         return []
 
 

diff --git a/site/cds_rdm/harvester_download/resources/resource.py b/site/cds_rdm/harvester_download/resources/resource.py
@@ -7,14 +7,20 @@
 
 """Harvester download resource."""
 
+import re
+import uuid
 from datetime import datetime
 
-from flask import Response, g, request, stream_with_context
+from flask import Response, current_app, request
 from flask_resources import Resource, route
-from invenio_audit_logs.proxies import current_audit_logs_service
+from invenio_access.permissions import system_identity
+from invenio_jobs.models import Run
+from invenio_jobs.proxies import current_jobs_logs_service
 
 from cds_rdm.administration.permissions import curators_permission
 
+INSPIRE_HARVESTER_TASK = "process_inspire"
+
 
 class HarvesterDownloadResource(Resource):
     """Harvester download resource."""
@@ -27,44 +33,129 @@ def create_url_rules(self):
         ]
 
     def download(self):
-        """Download audit logs for harvester reports as plain text file."""
+        """Download a harvester run's logs as a plain-text ``.log`` file.
+
+        Mirrors the admin job-run page: status header, failure banner,
+        truncation warning, and task-grouped entries formatted as
+        ``[yyyy-MM-dd HH:mm] LEVEL message``.
+        """
         permission = curators_permission
         if not permission.can():
             return {"message": "Permission denied"}, 403
 
-        query = request.args.get("q", "")
-        action = request.args.get("action", "")
-
-        if not query:
-            return {"message": "No query provided"}, 400
-
-        params = {"q": query, "size": 1000}
-        if action:
-            params["action"] = action
-
-        result = current_audit_logs_service.search(
-            identity=g.identity,
-            params=params,
-        )
-
-        def generate_logs():
-            """Generate log lines one by one."""
-            for hit in result.hits:
-                timestamp = hit.get("created", "N/A")
-                action = hit.get("action", "N/A")
-                resource_type = hit.get("resource", {}).get("type", "N/A")
-                resource_id = hit.get("resource", {}).get("id", "N/A")
-                user_email = hit.get("user", {}).get("email", "N/A")
-
-                # Format: [timestamp] action resource_type/resource_id user
-                line = f"[{timestamp}] {action} {resource_type}/{resource_id} {user_email}\n"
-                yield line
+        run_id = request.args.get("run_id", "").strip()
+        if not run_id:
+            return {"message": "Missing run_id"}, 400
+        try:
+            uuid.UUID(run_id)
+        except ValueError:
+            return {"message": "Invalid run_id"}, 400
+
+        run = Run.query.filter_by(id=run_id, parent_run_id=None).one_or_none()
+        if not run:
+            return {"message": "Run not found"}, 404
+
+        if not run.job or run.job.task != INSPIRE_HARVESTER_TASK:
+            return {"message": "Run is not a harvester run"}, 404
+
+
+        max_results = current_app.config.get("JOBS_LOGS_MAX_RESULTS", 2000)
+        try:
+            result = current_jobs_logs_service.search(
+                system_identity,
+                params={"q": str(run.id), "sort": "timestamp"},
+            )
+            hits = list(result.hits)
+            total = result.total or len(hits)
+        except Exception:
+            current_app.logger.exception(
+                "Failed to fetch structured job logs for harvester run %s", run.id
+            )
+            hits = []
+            total = 0
+
+        def _format_timestamp(raw):
+            # Admin UI (RunsLogs.js) format.
+            if not raw:
+                return "N/A"
+            try:
+                return datetime.fromisoformat(
+                    raw.replace("Z", "+00:00")
+                ).strftime("%Y-%m-%d %H:%M")
+            except (ValueError, TypeError):
+                return raw
+
+        # Group by context.task_id in first-seen order (RunsLogs.js buildLogTree).
+        task_groups = {}
+        seen = set()
+        error_count = 0
+        warning_count = 0
+        for hit in hits:
+            raw_ts = hit.get("timestamp")
+            level = hit.get("level", "INFO")
+            # Collapse whitespace so multi-line errors render on one line
+            # (admin UI does the same via ``white-space: normal``).
+            message = re.sub(r"\s+", " ", (hit.get("message") or "")).strip()
+            key = (raw_ts, level, message)
+            if key in seen:
+                continue
+            seen.add(key)
+            if level == "ERROR":
+                error_count += 1
+            elif level == "WARNING":
+                warning_count += 1
+            task_id = (hit.get("context") or {}).get("task_id") or "unknown"
+            task_groups.setdefault(task_id, []).append(
+                f"[{_format_timestamp(raw_ts)}] {level} {message}"
+            )
+
+        lines = [line for group in task_groups.values() for line in group]
+
+        header = []
+        status = getattr(run.status, "name", str(run.status))
+        header.append(f"Status: {status}")
+        header.append(f"Started: {_format_timestamp(run.started_at.isoformat())}")
+        if run.finished_at:
+            header.append(
+                f"Finished: {_format_timestamp(run.finished_at.isoformat())}"
+            )
+
+        summary = []
+        if status in ("FAILED", "PARTIAL_SUCCESS", "SUCCESS"):
+            summary.append(
+                {
+                    "FAILED": "Job failed",
+                    "PARTIAL_SUCCESS": "Job partially succeeded",
+                    "SUCCESS": "Job completed successfully",
+                }[status]
+            )
+        if run.message:
+            summary.append(run.message)
+        if error_count:
+            summary.append(f"{error_count} error(s) found in logs below")
+        if warning_count:
+            summary.append(f"{warning_count} warning(s) found in logs below")
+        if summary:
+            header.append("")
+            header.extend(summary)
+
+        if total and total > len(lines):
+            header.append(
+                f"Showing first {len(lines)} of {total} log entries "
+                f"(truncated at JOBS_LOGS_MAX_RESULTS={max_results})."
+            )
+        header.append("=" * 80)
+
+        logs = "\n".join(header + lines)
+
+        if not lines:
+            logs += "\n" + (run.message or "No logs available for this run.\n")
 
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"harvester_logs_{timestamp}.txt"
+        filename = f"harvester_logs_{run.id}_{timestamp}.log"
 
         return Response(
-            stream_with_context(generate_logs()),
+            logs,
             mimetype="text/plain",
             headers={"Content-Disposition": f'attachment; filename="{filename}"'},
         )
diff --git a/site/cds_rdm/permissions.py b/site/cds_rdm/permissions.py
@@ -12,6 +12,7 @@
 from invenio_administration.permissions import administration_permission
 from invenio_audit_logs.services.permissions import AuditLogPermissionPolicy
 from invenio_communities.permissions import CommunityPermissionPolicy
+from invenio_jobs.services.permissions import JobLogsPermissionPolicy
 from invenio_preservation_sync.services.permissions import (
     DefaultPreservationInfoPermissionPolicy,
 )
@@ -71,6 +72,12 @@ class CDSRDMRecordPermissionPolicy(RDMRecordPermissionPolicy):
     can_create = [AuthenticatedRegularUser(), SystemProcess()]
     can_read = RDMRecordPermissionPolicy.can_read + [ArchiverRead()]
     can_search = RDMRecordPermissionPolicy.can_search + [ArchiverRead()]
+    # TODO: Restrict this path so harvester curators only see system-made
+    # revisions from Harvester Reports "View Changes", not the full revision
+    # history for the record. See #783.
+    can_search_revisions = RDMRecordPermissionPolicy.can_search_revisions + [
+        HarvesterCurator()
+    ]
     can_read_files = RDMRecordPermissionPolicy.can_read_files + [ArchiverRead()]
     can_get_content_files = RDMRecordPermissionPolicy.can_get_content_files + [
         ArchiverRead()
@@ -111,6 +118,11 @@ class CDSAuditLogPermissionPolicy(AuditLogPermissionPolicy):
     can_read = AuditLogPermissionPolicy.can_read + [HarvesterCurator()]
 
 
+class CDSJobLogsPermissionPolicy(JobLogsPermissionPolicy):
+    """Job logs permission policy for CDS."""
+
+    can_read = JobLogsPermissionPolicy.can_search
+
 class CDSRequestsPermissionPolicy(RDMRequestsPermissionPolicy):
     """Requests permission policy."""
 

diff --git a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html
@@ -31,14 +31,6 @@ <h3 style="margin-top: 0; color: #856404; font-size: 16px;">Summary</h3>
 </div>
 {% endif %}
 
-<!-- Call to action button -->
-<div style="text-align: center; margin: 30px 0;">
-    <a href="{{ run_url }}"
-        style="display: inline-block; padding: 12px 30px; background-color: {{ status_info.color }}; color: white; text-decoration: none; border-radius: 5px; font-weight: bold; font-size: 16px;">
-        View Full Details
-    </a>
-</div>
-
 {% if job.task == "process_inspire" %}
 <!-- Harvester-specific actions (only for INSPIRE harvester jobs) -->
 {% set start_time = run.started_at | string | replace(' ', 'T') %}
@@ -122,4 +114,4 @@ <h3 style="margin-top: 0; color: #333; font-size: 16px;">Harvester Actions</h3>
         {% endif %}
     </details>
 </div>
-{% endblock content %}
+{% endblock content %}
diff --git a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt
@@ -12,8 +12,6 @@
 
 {{ status_info.action }}
 
-View full details: {{ run_url }}
-
 {% if run.status.name == "PARTIAL_SUCCESS" and run.errored_entries and run.total_entries %}
 Summary:
 - Successfully processed: {{ success_count }} items