Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion invenio.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ from cds_rdm.permissions import (
CDSRequestsPermissionPolicy,
CDSRDMPreservationSyncPermissionPolicy,
lock_edit_record_published_files,
CDSAuditLogPermissionPolicy
CDSAuditLogPermissionPolicy,
CDSJobLogsPermissionPolicy,
)
from cds_rdm.files import storage_factory
from cds_rdm.inspire_harvester.reader import InspireHTTPReader
Expand Down Expand Up @@ -774,3 +775,4 @@ AUDIT_LOGS_ENABLED = True
RDM_ALLOW_OWNERS_REMOVE_COMMUNITY_FROM_RECORD = False

AUDIT_LOGS_PERMISSION_POLICY = CDSAuditLogPermissionPolicy
APP_LOGS_PERMISSION_POLICY = CDSJobLogsPermissionPolicy
7 changes: 5 additions & 2 deletions site/cds_rdm/administration/harvester_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,8 @@ def init_search_config(self, **kwargs):
headers=self.get_search_request_headers(**kwargs),
pagination_options=(20, 50),
default_size=20,
hidden_params=[["action", "record.publish"]],
)
hidden_params=[
["action", "record.publish"],
["user_id", "system"],
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you check by any chance what does the REST API return for audit logs? we need to make sure REST API does not return wrong entries for curators role (on audit logs endpoint directly, not on this admin one)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I checked the audit logs API. The issue was in HarvesterCurator.query_filter in generators.py. It was looking at identity.provides the wrong way so the filter never really worked and curators could call GET /api/audit-logs and get way too much back. I fixed it to use Permission(harvester_admin_access_action).allows(identity) and only return system user and record.publish like Harvester Reports. Updated test_harvester_curator_permissions too.

],
)
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,29 @@ import React from "react";
import { withState } from "react-searchkit";
import { Button, Icon } from "semantic-ui-react";
import { i18next } from "@translations/invenio_administration/i18next";
import { extractRunIdFromQuery } from "./utils";

const DownloadButtonComponent = ({ currentQueryState }) => {
const handleDownload = () => {
const query = currentQueryState.queryString || "";
const hiddenParams = currentQueryState.hiddenParams || [];
const domContainer = document.getElementById("invenio-search-config");
const runs = JSON.parse(domContainer?.dataset.harvesterRuns || "[]");

const params = new URLSearchParams();
if (query) params.set("q", query);
hiddenParams.forEach(([key, value]) => params.append(key, value));
const runId = extractRunIdFromQuery(
currentQueryState.queryString || "",
runs
);

const downloadUrl = `/harvester-reports/download?${params.toString()}`;
window.location.href = downloadUrl;
const handleDownload = () => {
if (!runId) return;
const params = new URLSearchParams({ run_id: runId });
window.location.href = `/harvester-reports/download?${params.toString()}`;
};

return (
<Button
icon
labelPosition="left"
onClick={handleDownload}
disabled={!runId}
className="harvester-download-button"
size="small"
>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ import { DownloadButton } from "./DownloadButton";
* Custom SearchBar component with run selector
*/
const SearchBarComponent = ({ updateQueryState, currentQueryState }) => {
const hiddenParams = [
["action", "record.publish"],
["user_id", "system"],
];

// Get runs from data attributes
const domContainer = document.getElementById("invenio-search-config");
const runs = JSON.parse(domContainer?.dataset.harvesterRuns || "[]");
Expand Down Expand Up @@ -48,7 +53,7 @@ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => {
updateQueryState({
...currentQueryState,
queryString,
hiddenParams: [["action", "record.publish"]],
hiddenParams,
});
};

Expand All @@ -61,7 +66,7 @@ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => {
updateQueryState({
...currentQueryState,
queryString: inputValue,
hiddenParams: [["action", "record.publish"]],
hiddenParams,
});
};

Expand Down
13 changes: 9 additions & 4 deletions site/cds_rdm/generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,15 @@ def needs(self, **kwargs):
return [harvester_admin_access_action]

def query_filter(self, identity=None, **kwargs):
"""Restrict harvester curators to system user audit logs only."""
for need in identity.provides:
if need == harvester_admin_access_action:
return dsl.Q("term", **{"user.id": "system"})
"""Restrict harvester curators to system-user ``record.publish`` audit logs."""
if identity and Permission(harvester_admin_access_action).allows(identity):
return dsl.Q(
"bool",
must=[
dsl.Q("term", **{"user.id": "system"}),
dsl.Q("term", action="record.publish"),
],
)
return []


Expand Down
155 changes: 123 additions & 32 deletions site/cds_rdm/harvester_download/resources/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,20 @@

"""Harvester download resource."""

import re
import uuid
from datetime import datetime

from flask import Response, g, request, stream_with_context
from flask import Response, current_app, request
from flask_resources import Resource, route
from invenio_audit_logs.proxies import current_audit_logs_service
from invenio_access.permissions import system_identity
from invenio_jobs.models import Run
from invenio_jobs.proxies import current_jobs_logs_service

from cds_rdm.administration.permissions import curators_permission

INSPIRE_HARVESTER_TASK = "process_inspire"


class HarvesterDownloadResource(Resource):
"""Harvester download resource."""
Expand All @@ -27,44 +33,129 @@ def create_url_rules(self):
]

def download(self):
"""Download audit logs for harvester reports as plain text file."""
"""Download a harvester run's logs as a plain-text ``.log`` file.

Mirrors the admin job-run page: status header, failure banner,
truncation warning, and task-grouped entries formatted as
``[yyyy-MM-dd HH:mm] LEVEL message``.
"""
permission = curators_permission
if not permission.can():
return {"message": "Permission denied"}, 403

query = request.args.get("q", "")
action = request.args.get("action", "")

if not query:
return {"message": "No query provided"}, 400

params = {"q": query, "size": 1000}
if action:
params["action"] = action

result = current_audit_logs_service.search(
identity=g.identity,
params=params,
)

def generate_logs():
"""Generate log lines one by one."""
for hit in result.hits:
timestamp = hit.get("created", "N/A")
action = hit.get("action", "N/A")
resource_type = hit.get("resource", {}).get("type", "N/A")
resource_id = hit.get("resource", {}).get("id", "N/A")
user_email = hit.get("user", {}).get("email", "N/A")

# Format: [timestamp] action resource_type/resource_id user
line = f"[{timestamp}] {action} {resource_type}/{resource_id} {user_email}\n"
yield line
run_id = request.args.get("run_id", "").strip()
if not run_id:
return {"message": "Missing run_id"}, 400
try:
uuid.UUID(run_id)
except ValueError:
return {"message": "Invalid run_id"}, 400

run = Run.query.filter_by(id=run_id, parent_run_id=None).one_or_none()
if not run:
return {"message": "Run not found"}, 404

if not run.job or run.job.task != INSPIRE_HARVESTER_TASK:
return {"message": "Run is not a harvester run"}, 404


max_results = current_app.config.get("JOBS_LOGS_MAX_RESULTS", 2000)
try:
result = current_jobs_logs_service.search(
system_identity,
params={"q": str(run.id), "sort": "timestamp"},
)
hits = list(result.hits)
total = result.total or len(hits)
except Exception:
current_app.logger.exception(
"Failed to fetch structured job logs for harvester run %s", run.id
)
hits = []
total = 0

def _format_timestamp(raw):
# Admin UI (RunsLogs.js) format.
if not raw:
return "N/A"
try:
return datetime.fromisoformat(
raw.replace("Z", "+00:00")
).strftime("%Y-%m-%d %H:%M")
except (ValueError, TypeError):
return raw

# Group by context.task_id in first-seen order (RunsLogs.js buildLogTree).
task_groups = {}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wasn't this grouping already done somewhere in the code? I am having flashbacks :)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the admin view already does that grouping in RunsLogs.js. The download is just plain text though, so I repeated the same task_id grouping here so the file looks like what you see in admin.

seen = set()
error_count = 0
warning_count = 0
for hit in hits:
raw_ts = hit.get("timestamp")
level = hit.get("level", "INFO")
# Collapse whitespace so multi-line errors render on one line
# (admin UI does the same via ``white-space: normal``).
message = re.sub(r"\s+", " ", (hit.get("message") or "")).strip()
key = (raw_ts, level, message)
if key in seen:
continue
seen.add(key)
if level == "ERROR":
error_count += 1
elif level == "WARNING":
warning_count += 1
task_id = (hit.get("context") or {}).get("task_id") or "unknown"
task_groups.setdefault(task_id, []).append(
f"[{_format_timestamp(raw_ts)}] {level} {message}"
)

lines = [line for group in task_groups.values() for line in group]

header = []
status = getattr(run.status, "name", str(run.status))
header.append(f"Status: {status}")
header.append(f"Started: {_format_timestamp(run.started_at.isoformat())}")
if run.finished_at:
header.append(
f"Finished: {_format_timestamp(run.finished_at.isoformat())}"
)

summary = []
if status in ("FAILED", "PARTIAL_SUCCESS", "SUCCESS"):
summary.append(
{
"FAILED": "Job failed",
"PARTIAL_SUCCESS": "Job partially succeeded",
"SUCCESS": "Job completed successfully",
}[status]
)
if run.message:
summary.append(run.message)
if error_count:
summary.append(f"{error_count} error(s) found in logs below")
if warning_count:
summary.append(f"{warning_count} warning(s) found in logs below")
if summary:
header.append("")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does the end result look like? Can you attach an example file?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

header.extend(summary)

if total and total > len(lines):
header.append(
f"Showing first {len(lines)} of {total} log entries "
f"(truncated at JOBS_LOGS_MAX_RESULTS={max_results})."
)
header.append("=" * 80)

logs = "\n".join(header + lines)

if not lines:
logs += "\n" + (run.message or "No logs available for this run.\n")

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"harvester_logs_{timestamp}.txt"
filename = f"harvester_logs_{run.id}_{timestamp}.log"

return Response(
stream_with_context(generate_logs()),
logs,
mimetype="text/plain",
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
)
12 changes: 12 additions & 0 deletions site/cds_rdm/permissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from invenio_administration.permissions import administration_permission
from invenio_audit_logs.services.permissions import AuditLogPermissionPolicy
from invenio_communities.permissions import CommunityPermissionPolicy
from invenio_jobs.services.permissions import JobLogsPermissionPolicy
from invenio_preservation_sync.services.permissions import (
DefaultPreservationInfoPermissionPolicy,
)
Expand Down Expand Up @@ -71,6 +72,12 @@ class CDSRDMRecordPermissionPolicy(RDMRecordPermissionPolicy):
can_create = [AuthenticatedRegularUser(), SystemProcess()]
can_read = RDMRecordPermissionPolicy.can_read + [ArchiverRead()]
can_search = RDMRecordPermissionPolicy.can_search + [ArchiverRead()]
# TODO: Restrict this path so harvester curators only see system-made
# revisions from Harvester Reports "View Changes", not the full revision
# history for the record. See #783.
can_search_revisions = RDMRecordPermissionPolicy.can_search_revisions + [
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So curators can use the "View Changes" button on the Harvester Reports page.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make sure they can only see revisions made by system user?
if they can see more, let's discuss on how to tackle it before jumping to implementations

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

discussed in person - we leave it as is - just for bookkeeping reasons, can you add a todo here to implement more restrictive permissions (plus please create an issue pointing to this todo, and cross-link)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

HarvesterCurator()
]
can_read_files = RDMRecordPermissionPolicy.can_read_files + [ArchiverRead()]
can_get_content_files = RDMRecordPermissionPolicy.can_get_content_files + [
ArchiverRead()
Expand Down Expand Up @@ -111,6 +118,11 @@ class CDSAuditLogPermissionPolicy(AuditLogPermissionPolicy):
can_read = AuditLogPermissionPolicy.can_read + [HarvesterCurator()]


class CDSJobLogsPermissionPolicy(JobLogsPermissionPolicy):
"""Job logs permission policy for CDS."""

can_read = JobLogsPermissionPolicy.can_search

class CDSRequestsPermissionPolicy(RDMRequestsPermissionPolicy):
"""Requests permission policy."""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,6 @@ <h3 style="margin-top: 0; color: #856404; font-size: 16px;">Summary</h3>
</div>
{% endif %}

<!-- Call to action button -->
<div style="text-align: center; margin: 30px 0;">
<a href="{{ run_url }}"
style="display: inline-block; padding: 12px 30px; background-color: {{ status_info.color }}; color: white; text-decoration: none; border-radius: 5px; font-weight: bold; font-size: 16px;">
View Full Details
</a>
</div>

{% if job.task == "process_inspire" %}
<!-- Harvester-specific actions (only for INSPIRE harvester jobs) -->
{% set start_time = run.started_at | string | replace(' ', 'T') %}
Expand Down Expand Up @@ -122,4 +114,4 @@ <h3 style="margin-top: 0; color: #333; font-size: 16px;">Harvester Actions</h3>
{% endif %}
</details>
</div>
{% endblock content %}
{% endblock content %}
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@

{{ status_info.action }}

View full details: {{ run_url }}

{% if run.status.name == "PARTIAL_SUCCESS" and run.errored_entries and run.total_entries %}
Summary:
- Successfully processed: {{ success_count }} items
Expand Down
Loading
Loading