Skip to content

Commit c41b3b3

Browse files
committed
feat(analytics): refactor bot event filtering to use is_bot_name utility and enhance tests
1 parent 2b38c28 commit c41b3b3

File tree

6 files changed

+184
-127
lines changed

6 files changed

+184
-127
lines changed

backend/analytics_server/mhq/service/code/sync/etl_code_analytics.py

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
PullRequestState,
1111
)
1212
from mhq.utils.time import Interval
13-
import re
13+
from mhq.utils.string import is_bot_name
1414

1515

1616
class CodeETLAnalyticsService:
@@ -184,20 +184,4 @@ def filter_non_bot_events(
184184
self, pr_events: List[PullRequestEvent]
185185
) -> List[PullRequestEvent]:
186186
"""Filter out events created by bot users using regex patterns."""
187-
188-
bot_pattern = re.compile(
189-
r"bot|[bB][oO][tT]|\[bot\]|automated|jenkins|ci-|github-actions",
190-
re.IGNORECASE,
191-
)
192-
193-
return [
194-
event
195-
for event in pr_events
196-
if (
197-
not bool(bot_pattern.search(event.actor_username))
198-
and not (
199-
event.data.get("user")
200-
and event.data.get("user", {}).get("type") == "Bot"
201-
)
202-
)
203-
]
187+
return [event for event in pr_events if (not is_bot_name(event.actor_username))]

backend/analytics_server/mhq/service/code/sync/etl_github_handler.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,9 @@ def process_pr(
181181
pr_commits_model_list: List[PullRequestCommit] = self._to_pr_commits(
182182
commits, pr_model
183183
)
184-
184+
non_bot_pr_events_model_list = self._github_bot_filter(pr_events_model_list)
185185
pr_model = self.code_etl_analytics_service.create_pr_metrics(
186-
pr_model, pr_events_model_list, pr_commits_model_list
186+
pr_model, non_bot_pr_events_model_list, pr_commits_model_list
187187
)
188188

189189
return pr_model, pr_events_model_list, pr_commits_model_list
@@ -354,6 +354,17 @@ def _dt_from_github_dt_string(dt_string: str) -> datetime:
354354
dt_without_timezone = datetime.strptime(dt_string, ISO_8601_DATE_FORMAT)
355355
return dt_without_timezone.replace(tzinfo=pytz.UTC)
356356

357+
@staticmethod
def _github_bot_filter(pr_events: List[PullRequestEvent]) -> List[PullRequestEvent]:
    """Return the events whose author payload is not marked as a GitHub bot.

    For each event the author object is read from ``data["user"]``, falling
    back to ``data["actor"]`` when ``"user"`` is missing or falsy. An event
    is dropped only when that object's ``"type"`` equals ``"Bot"``; events
    with no author object at all are kept.

    Args:
        pr_events: Events to filter. The input list is not modified.

    Returns:
        A new list with the non-bot events, in their original order.
    """
    non_bot_events = []
    for pr_event in pr_events:
        # Treat a missing payload like an empty one rather than raising.
        # NOTE(review): assumes ``data`` is a dict or None - confirm on the model.
        payload = pr_event.data or {}
        author = payload.get("user") or payload.get("actor") or {}
        if author.get("type") != "Bot":
            non_bot_events.append(pr_event)
    return non_bot_events
367+
357368

358369
def get_github_etl_handler(org_id: str) -> GithubETLHandler:
359370
def _get_access_token():
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
from uuid import uuid4
2+
import re
23

34

45
def uuid4_str() -> str:
    """Return a newly generated version-4 UUID rendered as a string."""
    return str(uuid4())
7+
8+
9+
# Compiled once at import time instead of on every call.
# Case-insensitive alternatives, in order:
#   1. "bot" preceded by at least one of "-", "_", "[", "]", "@" or a space,
#      optionally followed by "-", "_", digits or brackets, ending on a
#      word boundary (e.g. "jenkins-bot", "tool-bot-123", "deploy bot")
#   2. the literal "[bot]"
#   3. "_bot_" anywhere
#   4. a trailing "_bot"
#   5. a leading "bot_"
_BOT_NAME_PATTERN = re.compile(
    r"(?i)(\b[\w@-]*[-_\[\]@ ]+bot[-_\d\[\]]*\b|\[bot\]|_bot_|_bot$|^bot_)"
)


def is_bot_name(name: str) -> bool:
    """Return True when ``name`` looks like a bot account name.

    The check is a naming heuristic only: "bot" must be set off by a
    separator, so words that merely contain the letters (such as
    "robotics" or "botany") are not treated as bots.

    Args:
        name: The account or user name to inspect. ``None`` and the empty
            string are treated as "not a bot" instead of raising.

    Returns:
        True if the name matches one of the bot naming patterns.
    """
    if not name:
        return False
    return bool(_BOT_NAME_PATTERN.search(name))

backend/analytics_server/tests/service/code/sync/test_etl_code_analytics.py

Lines changed: 84 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -502,129 +502,106 @@ def test_rework_cycles_returs_1_for_multiple_approvals():
502502
)
503503

504504

505-
def test_create_pr_metrics_filters_bot_events():
505+
def test_filter_non_bot_events_common_patterns():
506506
pr_service = CodeETLAnalyticsService()
507-
t1 = time_now()
508-
t2 = t1 + timedelta(hours=1)
509-
t3 = t2 + timedelta(hours=1)
510-
pr = get_pull_request(state=PullRequestState.MERGED, created_at=t1, updated_at=t1)
511-
bot_event = get_pull_request_event(
512-
pull_request_id=pr.id,
513-
reviewer="test-bot[bot]",
514-
state=PullRequestEventState.COMMENTED.value,
515-
created_at=t2,
516-
)
517-
human_event = get_pull_request_event(
518-
pull_request_id=pr.id,
519-
reviewer="human_user",
520-
state=PullRequestEventState.APPROVED.value,
521-
created_at=t3,
522-
)
523-
pr_metrics = pr_service.create_pr_metrics(pr, [bot_event, human_event], [])
524-
assert "human_user" in pr_metrics.reviewers
525-
assert "test-bot[bot]" not in pr_metrics.reviewers
526-
527507

528-
def test_create_pr_metrics_no_bot_first_response_time():
508+
bot_events = [
509+
get_pull_request_event(reviewer="github-actions[bot]"),
510+
get_pull_request_event(reviewer="jenkins-bot"),
511+
get_pull_request_event(reviewer="renovate[bot]"),
512+
get_pull_request_event(reviewer="test_bot_service"),
513+
get_pull_request_event(reviewer="my_bot"),
514+
get_pull_request_event(reviewer="bot_user"),
515+
get_pull_request_event(reviewer="SomeService-bot-123"),
516+
get_pull_request_event(reviewer="CI-BOT"),
517+
get_pull_request_event(reviewer="bot-123[bot]"),
518+
get_pull_request_event(reviewer="helper_bot_v2"),
519+
]
520+
human_events = [
521+
get_pull_request_event(reviewer="john_doe"),
522+
get_pull_request_event(reviewer="robotics_expert"),
523+
get_pull_request_event(reviewer="sabotage"),
524+
get_pull_request_event(reviewer="lobotomy"),
525+
get_pull_request_event(reviewer="cubot"),
526+
get_pull_request_event(reviewer="botany"),
527+
]
528+
529+
all_events = bot_events + human_events
530+
filtered_events = pr_service.filter_non_bot_events(all_events)
531+
532+
assert len(filtered_events) == len(human_events)
533+
534+
filtered_usernames = [e.actor_username for e in filtered_events]
535+
for event in bot_events:
536+
assert event.actor_username not in filtered_usernames
537+
538+
for event in human_events:
539+
assert event.actor_username in filtered_usernames
540+
541+
542+
def test_filter_non_bot_events_edge_cases():
529543
pr_service = CodeETLAnalyticsService()
530-
t1 = time_now()
531-
t2 = t1 + timedelta(hours=1)
532-
pr = get_pull_request(state=PullRequestState.MERGED, created_at=t1, updated_at=t1)
533-
first_review_event = get_pull_request_event(
534-
pull_request_id=pr.id,
535-
reviewer="reviewer",
536-
state=PullRequestEventState.COMMENTED.value,
537-
created_at=t2,
538-
)
539-
pr_metrics = pr_service.create_pr_metrics(pr, [first_review_event], [])
540-
assert pr_metrics.first_response_time == 3600
541-
542-
543-
def test_create_pr_metrics_no_bot_rework_time():
544-
pr_service = CodeETLAnalyticsService()
545-
t1 = time_now()
546-
t2 = t1 + timedelta(hours=1)
547-
t3 = t2 + timedelta(hours=1)
548-
pr = get_pull_request(state=PullRequestState.MERGED, created_at=t1, updated_at=t1)
549-
changes_requested_event = get_pull_request_event(
550-
pull_request_id=pr.id,
551-
reviewer="reviewer",
552-
state=PullRequestEventState.CHANGES_REQUESTED.value,
553-
created_at=t2,
554-
)
555-
approval_event = get_pull_request_event(
556-
pull_request_id=pr.id,
557-
reviewer="reviewer",
558-
state=PullRequestEventState.APPROVED.value,
559-
created_at=t3,
560-
)
561-
pr_metrics = pr_service.create_pr_metrics(
562-
pr, [changes_requested_event, approval_event], []
563-
)
564-
assert pr_metrics.rework_time == 3600
565544

545+
events = [
546+
get_pull_request_event(reviewer="test-bot[123]"),
547+
get_pull_request_event(reviewer="deploy bot"),
548+
get_pull_request_event(reviewer="special@bot@chars"),
549+
get_pull_request_event(reviewer="robo"),
550+
get_pull_request_event(reviewer="botanic"),
551+
get_pull_request_event(reviewer="robot"),
552+
get_pull_request_event(reviewer="abot"),
553+
]
566554

567-
def test_create_pr_metrics_no_human_first_response_time():
568-
pr_service = CodeETLAnalyticsService()
569-
t1 = time_now()
570-
t2 = t1 + timedelta(hours=1)
571-
pr = get_pull_request(state=PullRequestState.MERGED, created_at=t1, updated_at=t1)
572-
first_review_event = get_pull_request_event(
573-
pull_request_id=pr.id,
574-
reviewer="test-bot[bot]",
575-
state=PullRequestEventState.COMMENTED.value,
576-
created_at=t2,
577-
)
578-
pr_metrics = pr_service.create_pr_metrics(pr, [first_review_event], [])
579-
assert pr_metrics.first_response_time is None
555+
filtered_events = pr_service.filter_non_bot_events(events)
580556

557+
expected_remaining = ["robo", "botanic", "robot", "abot"]
558+
filtered_usernames = [e.actor_username for e in filtered_events]
581559

582-
def test_create_pr_metrics_no_human_rework_time():
583-
pr_service = CodeETLAnalyticsService()
584-
t1 = time_now()
585-
t2 = t1 + timedelta(hours=1)
586-
t3 = t2 + timedelta(hours=1)
587-
pr = get_pull_request(state=PullRequestState.MERGED, created_at=t1, updated_at=t1)
588-
changes_requested_event = get_pull_request_event(
589-
pull_request_id=pr.id,
590-
reviewer="test-bot[bot]",
591-
state=PullRequestEventState.CHANGES_REQUESTED.value,
592-
created_at=t2,
593-
)
594-
approval_event = get_pull_request_event(
595-
pull_request_id=pr.id,
596-
reviewer="test-bot[bot]",
597-
state=PullRequestEventState.APPROVED.value,
598-
created_at=t3,
599-
)
600-
pr_metrics = pr_service.create_pr_metrics(
601-
pr, [changes_requested_event, approval_event], []
602-
)
603-
assert pr_metrics.rework_time is None
560+
assert len(filtered_events) == 4
561+
for username in expected_remaining:
562+
assert username in filtered_usernames
604563

605564

606-
def test_create_pr_metrics_filters_bot_type_events():
565+
def test_create_pr_metrics_bot_detection_in_review_events():
607566
pr_service = CodeETLAnalyticsService()
608567
t1 = time_now()
609-
t2 = t1 + timedelta(hours=1)
610-
t3 = t2 + timedelta(hours=1)
611568
pr = get_pull_request(state=PullRequestState.MERGED, created_at=t1, updated_at=t1)
612569

613-
bot_event = get_pull_request_event(
614-
pull_request_id=pr.id,
615-
reviewer="github_app",
616-
state=PullRequestEventState.COMMENTED.value,
617-
created_at=t2,
618-
data={"user": {"type": "Bot"}},
619-
)
570+
bot_reviewers = [
571+
"github-actions[bot]",
572+
"dependabot-preview[bot]",
573+
"Jenkins_Bot",
574+
"ci_bot_service",
575+
"bot_reviewer",
576+
"tool-bot-123",
577+
"_bot_helper",
578+
"helper_bot",
579+
]
580+
581+
bot_events = []
582+
for i, reviewer in enumerate(bot_reviewers):
583+
bot_events.append(
584+
get_pull_request_event(
585+
pull_request_id=pr.id,
586+
reviewer=reviewer,
587+
state=PullRequestEventState.COMMENTED.value,
588+
created_at=t1 + timedelta(minutes=i + 1),
589+
)
590+
)
620591

621592
human_event = get_pull_request_event(
622593
pull_request_id=pr.id,
623-
reviewer="human_user",
594+
reviewer="human_reviewer",
624595
state=PullRequestEventState.APPROVED.value,
625-
created_at=t3,
596+
created_at=t1 + timedelta(hours=1),
626597
)
627598

628-
pr_metrics = pr_service.create_pr_metrics(pr, [bot_event, human_event], [])
629-
assert "human_user" in pr_metrics.reviewers
630-
assert "github_app" not in pr_metrics.reviewers
599+
all_events = bot_events + [human_event]
600+
601+
pr_metrics = pr_service.create_pr_metrics(pr, all_events, [])
602+
603+
assert len(pr_metrics.reviewers) == 1
604+
assert "human_reviewer" in pr_metrics.reviewers
605+
606+
for bot_reviewer in bot_reviewers:
607+
assert bot_reviewer not in pr_metrics.reviewers

backend/analytics_server/tests/service/code/sync/test_etl_github_handler.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,3 +351,32 @@ def test__dt_from_github_dt_string_given_date_string_returns_correct_datetime():
351351
date_string = "2024-04-18T10:53:15Z"
352352
expected = datetime(2024, 4, 18, 10, 53, 15, tzinfo=pytz.UTC)
353353
assert GithubETLHandler._dt_from_github_dt_string(date_string) == expected
354+
355+
356+
def test__github_bot_filter_given_bot_events_returns_empty_list():
    """An event whose ``user`` payload has type "Bot" is filtered out."""
    bot_event = get_pull_request_event()
    bot_event.data = {"user": {"type": "Bot", "login": "dependabot"}}

    # _github_bot_filter is a staticmethod, so no handler instance is needed.
    result = GithubETLHandler._github_bot_filter([bot_event])

    assert result == []
362+
363+
364+
def test__github_bot_filter_given_non_bot_events_returns_same_list():
    """An event whose ``user`` payload has type "User" is kept."""
    human_event = get_pull_request_event()
    human_event.data = {"user": {"type": "User", "login": "john_doe"}}

    # _github_bot_filter is a staticmethod, so no handler instance is needed.
    result = GithubETLHandler._github_bot_filter([human_event])

    assert result == [human_event]
372+
373+
374+
def test__github_bot_filter_given_mixed_events_returns_non_bot_events():
    """A bot identified via the ``actor`` payload is dropped; the human stays."""
    bot_event = get_pull_request_event()
    bot_event.data = {"actor": {"type": "Bot", "login": "dependabot"}}
    human_event = get_pull_request_event()
    human_event.data = {"user": {"type": "User", "login": "john_doe"}}

    # _github_bot_filter is a staticmethod, so no handler instance is needed.
    result = GithubETLHandler._github_bot_filter([bot_event, human_event])

    assert result == [human_event]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
2+
3+
from mhq.utils.string import is_bot_name
4+
5+
def test_simple_bot_names():
    """A trailing "bot" after "_" or "-" is detected."""
    for name in ("test_bot", "test-bot"):
        assert is_bot_name(name), f"expected {name!r} to be detected as a bot"
8+
9+
10+
def test_bot_with_prefixes_and_suffixes():
    """"bot" is detected with "_", "-" or space separators, with or without a numeric suffix."""
    names = (
        "my_bot",
        "my-bot",
        "my bot",
        "test_bot_123",
        "test-bot-123",
        "test bot 123",
    )
    for name in names:
        assert is_bot_name(name), f"expected {name!r} to be detected as a bot"
17+
18+
19+
def test_special_patterns():
    """Underscore-delimited "bot" is detected at the start, middle and end of a name."""
    names = (
        "name_bot_suffix",
        "name_bot",
        "bot_name",
        "my_bot_is_cool",
    )
    for name in names:
        assert is_bot_name(name), f"expected {name!r} to be detected as a bot"
24+
25+
26+
def test_case_insensitivity():
    """Detection ignores the letter case of both "bot" and the rest of the name."""
    for name in ("my_BOT", "MY-bot", "My Bot"):
        assert is_bot_name(name), f"expected {name!r} to be detected as a bot"
30+
31+
32+
def test_special_characters():
    """"@" works as a separator and a bare "[bot]" marker is detected."""
    for name in ("test@bot", "[bot]"):
        assert is_bot_name(name), f"expected {name!r} to be detected as a bot"
35+
36+
def test_negative_cases():
    """Ordinary words that merely contain the letters "bot" are not flagged."""
    names = (
        "robotics",
        "lobotomy",
        "botany",
        "about",
        "robotic",
        "bots",
    )
    for name in names:
        assert not is_bot_name(name), f"{name!r} should not be detected as a bot"
43+
44+
45+
def test_edge_cases():
    """Empty, whitespace-only and purely numeric names are not flagged."""
    for name in ("", " ", "12345"):
        assert not is_bot_name(name), f"{name!r} should not be detected as a bot"

0 commit comments

Comments
 (0)