diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index ef6651b..1239f22 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -49,6 +49,8 @@ docker push equalitie/baskervillehall:dev docker buildx build --platform linux/amd64 -f ./Dockerfile.session-simple . -t equalitie/baskervillehall:session2 docker push equalitie/baskervillehall:session2 +docker buildx build --platform linux/amd64 -f ./Dockerfile.session-simple . -t equalitie/baskervillehall:session_dev +docker push equalitie/baskervillehall:session_dev ``` ### Building predict image @@ -115,6 +117,15 @@ kubectl apply -f deployment/postgres/postgres-baskervillehall-service.yaml kubectl port-forward service/postgres-baskervillehall 5433:5432 +### Postgres deployment dev +kubectl apply -f deployment/postgres_dev/postgres-baskervillehall-dev-secret.yaml +kubectl apply -f deployment/postgres_dev/postgres-baskervillehall-dev-pv.yaml +kubectl apply -f deployment/postgres_dev/postgres-baskervillehall-dev-pvc.yaml +kubectl apply -f deployment/postgres_dev/postgres-baskervillehall-dev.yaml +kubectl apply -f deployment/postgres_dev/postgres-baskervillehall-dev-service.yaml + +kubectl port-forward service/postgres-baskervillehall-dev 5433:5432 + ### Monitoring ```commandline diff --git a/config_baskervillehall_ch.yaml b/config_baskervillehall_ch.yaml index d2de7b6..b8b2699 100644 --- a/config_baskervillehall_ch.yaml +++ b/config_baskervillehall_ch.yaml @@ -53,7 +53,7 @@ data: NUM_OFFENCES_FOR_DIFFICULT_CHALLENGE: "5" MAXSIZE_PENDING: "10000000" USE_SHAPLEY: "True" - POSTGRES_HOST: "postgres-database-service.baskerville-server-database-production" + POSTGRES_HOST: "postgres-baskervillehall-dev.default" POSTGRES_PORT: "5432" POSTGRES_REFRESH_PERIOD_IN_SECONDS: "180" SENSITIVITY_FACTOR: "0.05" @@ -61,7 +61,7 @@ data: NUM_REQUESTS_IN_STORAGE: "20" SQL_TABLE_SESSIONS: "sessions" SQL_TABLE_COMMANDS: "challenge_command_history" - AUTOCREATE_HOSTNAME_ID: "False" + AUTOCREATE_HOSTNAME_ID: "True" MAX_SESSIONS_FOR_IP: "10" BOT_SCORE_THRESHOLD: "0.5" BAD_BOT_CHALLENGE: "True" diff --git a/deployment/postgres/create_schema.sql b/deployment/postgres/create_schema.sql index bc28967..2aec4e0 100644 --- a/deployment/postgres/create_schema.sql +++ b/deployment/postgres/create_schema.sql @@ -61,7 +61,7 @@ CREATE TABLE public.sessions ( ALTER TABLE public.sessions ADD CONSTRAINT sessions_hostname_id_fkey FOREIGN KEY (hostname_id) REFERENCES public.hostname(hostname_id) ON DELETE CASCADE; CREATE INDEX sessions_index ON sessions (session_end, host_name); -DROP TABLE public.challenge_command_history; +-- DROP TABLE public.challenge_command_history; CREATE TABLE public.challenge_command_history ( challenge_command_id uuid DEFAULT uuid_generate_v4() NOT NULL, diff --git a/deployment/postgres/dashboard.sql b/deployment/postgres/dashboard.sql new file mode 100644 index 0000000..b95270a --- /dev/null +++ b/deployment/postgres/dashboard.sql @@ -0,0 +1,84 @@ +WITH hours AS ( + SELECT generate_series( + date_trunc('hour', now() AT TIME ZONE 'UTC') - interval '23 hours', + date_trunc('hour', now() AT TIME ZONE 'UTC'), + interval '1 hour' + ) AS bucket_hour +) +SELECT + h.bucket_hour AS "time", -- UTC + COALESCE(c.challenged_ips, 0) AS challenged, + COALESCE(p.passed_ips, 0) AS passed +FROM hours h +LEFT JOIN public.dashboard_challenged_1h c + ON c.bucket_hour = h.bucket_hour +LEFT JOIN public.dashboard_passed_1h p + ON p.bucket_hour = h.bucket_hour +ORDER BY h.bucket_hour; + +-- precision timeseries 24h +WITH hours AS ( + SELECT generate_series( + date_trunc('hour', now() AT TIME ZONE 'UTC') - interval '23 hours', + date_trunc('hour', now() AT TIME ZONE 'UTC'), + interval '1 hour' + ) AS bucket_hour +) +SELECT + h.bucket_hour AS "time", -- UTC + COALESCE(p.precision_pct, 0)::DOUBLE PRECISION AS precision_pct +FROM hours h +LEFT JOIN public.dashboard_precision_1h p + ON p.bucket_hour = h.bucket_hour +ORDER BY h.bucket_hour; + + + +-- average precision 24h +SELECT + ROUND( + SUM(precision_pct * total_ips) / NULLIF(SUM(total_ips), 0), + 1 + )::DOUBLE PRECISION AS weighted_avg_precision_24h +FROM public.dashboard_precision_1h +WHERE + bucket_hour >= date_trunc('hour', now() AT TIME ZONE 'UTC') - interval '23 hours'; + + + +-- human vs automated last 24h +WITH hours AS ( + SELECT generate_series( + date_trunc('hour', now() AT TIME ZONE 'UTC') - interval '23 hours', + date_trunc('hour', now() AT TIME ZONE 'UTC'), + interval '1 hour' + ) AS bucket_hour +), +agg AS ( + SELECT + bucket_hour, + COALESCE(SUM(cnt) FILTER (WHERE human_label = 'human'), 0) AS human, + COALESCE(SUM(cnt) FILTER (WHERE human_label = 'bot'), 0) AS bot + FROM public.dashboard_human_bot_1h + WHERE + bucket_hour >= date_trunc('hour', now() AT TIME ZONE 'UTC') - interval '23 hours' + GROUP BY bucket_hour +) +SELECT + h.bucket_hour AS "time", -- UTC + COALESCE(a.human, 0) AS human, + COALESCE(a.bot, 0) AS bot, + ROUND( + 100.0 * COALESCE(a.human, 0) + / NULLIF(COALESCE(a.human, 0) + COALESCE(a.bot, 0), 0), + 1 + )::DOUBLE PRECISION AS human_percenatage, + ROUND( + 100.0 * COALESCE(a.bot, 0) + / NULLIF(COALESCE(a.human, 0) + COALESCE(a.bot, 0), 0), + 1 + )::DOUBLE PRECISION AS bot_percentage +FROM hours h +LEFT JOIN agg a + ON a.bucket_hour = h.bucket_hour +ORDER BY h.bucket_hour; diff --git a/deployment/postgres/postgres-baskervillehall-secret.yaml b/deployment/postgres/postgres-baskervillehall-secret.yaml index b406047..0826bd5 100644 --- a/deployment/postgres/postgres-baskervillehall-secret.yaml +++ b/deployment/postgres/postgres-baskervillehall-secret.yaml @@ -5,4 +5,4 @@ metadata: type: Opaque stringData: user: postgres - password: zJmh93FfhMvX6tDRNEoLuu97 \ No newline at end of file + password: \ No newline at end of file diff --git a/deployment/postgres/postgres-lb.yaml b/deployment/postgres/postgres-lb.yaml index 39427f8..10c47dd 100644 --- a/deployment/postgres/postgres-lb.yaml +++ b/deployment/postgres/postgres-lb.yaml @@ -15,6 +15,7 @@ spec: - 134.122.32.231/32 - 147.182.146.235/32 - 212.105.155.18/32 + - 91.65.62.64/32 selector: app: postgres-baskervillehall diff --git a/deployment/postgres_dev/create_schema.sql b/deployment/postgres_dev/create_schema.sql new file mode 100644 index 0000000..02c36a2 --- /dev/null +++ b/deployment/postgres_dev/create_schema.sql @@ -0,0 +1,159 @@ + +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + +-- DROP TABLE public.hostname; + +CREATE TABLE public.hostname ( + hostname_id uuid DEFAULT uuid_generate_v4() NOT NULL, + hostname text NOT NULL, + created_at timestamp NOT NULL, + updated_at timestamp NOT NULL, + updated_by text NOT NULL, + CONSTRAINT hostname_hostname_key UNIQUE (hostname), + CONSTRAINT hostname_pkey PRIMARY KEY (hostname_id) +); +CREATE INDEX hostname_index ON public.hostname USING btree (hostname); + +-- DROP TABLE public.sessions; + +CREATE TABLE public.sessions ( + session_id uuid DEFAULT uuid_generate_v4() NOT NULL, + hostname_id uuid NOT NULL, + host_name text NOT NULL, + ip text NOT NULL, + session_cookie text NOT NULL, + ip_cookie text NOT NULL, + primary_session int4 DEFAULT 0 NULL, + human int4 DEFAULT 0 NULL, + + ua_score float8 DEFAULT 0 NULL, + verified_bot int4 DEFAULT 0 NULL, + num_languages int4 DEFAULT 0 NULL, + valid_browser_ciphers int4 DEFAULT 0 NULL, + cipher text, + ciphers text, + asn text, + asn_name text, + is_scraper int4 DEFAULT 0 NULL, + + vpn int4 DEFAULT 0 NULL, + class text, + passed_challenge int4 DEFAULT 0 NULL, + fingerprints text NULL, + user_agent text NULL, + country text NULL, + continent text NULL, + datacenter text NULL, + hits int4 DEFAULT 0 NOT NULL, + hit_rate int4 DEFAULT 0 NULL, + num_user_agent int4 DEFAULT 1 NULL, + duration float8 DEFAULT 0 NOT NULL, + session_start timestamp NOT NULL, + session_end timestamp NOT NULL, + requests text NULL, + bot_score float8 DEFAULT -1.0 NOT NULL, + bot_score_top_factor text NULL, + + scraper_name text, + created_at timestamp DEFAULT CURRENT_TIMESTAMP NOT NULL, + CONSTRAINT sessions_key PRIMARY KEY (session_id) +); +ALTER TABLE public.sessions ADD CONSTRAINT sessions_hostname_id_fkey FOREIGN KEY (hostname_id) REFERENCES public.hostname(hostname_id) ON DELETE CASCADE; +CREATE INDEX sessions_index ON sessions (session_end, host_name); + +-- DROP TABLE public.challenge_command_history; + +CREATE TABLE public.challenge_command_history ( + challenge_command_id uuid DEFAULT uuid_generate_v4() NOT NULL, + hostname_id uuid NOT NULL, + command_type_name text DEFAULT ''::text NOT NULL, + ip_address inet NOT NULL, + session_cookie text DEFAULT ''::text NOT NULL, + "source" text DEFAULT ''::text NOT NULL, + created_at timestamp DEFAULT CURRENT_TIMESTAMP NOT NULL, + updated_at timestamp DEFAULT CURRENT_TIMESTAMP NOT NULL, + updated_by text NOT NULL, + duration float8 DEFAULT 0.0 NOT NULL, + request_count int4 DEFAULT 0 NOT NULL, + + + + host_name text NOT NULL, + ip_cookie text NOT NULL, + primary_session int4 DEFAULT 0 NULL, + human int4 DEFAULT 0 NULL, + passed_challenge int4 DEFAULT 0 NULL, + bot_score float8 DEFAULT -1.0 NOT NULL, + bot_score_top_factor text NULL, + user_agent text NULL, + country text NULL, + continent text NULL, + datacenter text NULL, + shapley_if text NULL, + shapley_feature_if text NULL, + shapley_ae text NULL, + shapley_feature_ae text NULL, + difficulty int4 DEFAULT 0 NULL, + hits int4 DEFAULT 0 NOT NULL, + hit_rate int4 DEFAULT 0 NULL, + num_user_agent int4 DEFAULT 1 NULL, + session_start timestamp NOT NULL, + session_end timestamp NOT NULL, + requests text NULL, + meta text DEFAULT ''::text NOT NULL, + score_if float8 DEFAULT 0.0 NOT NULL, + score_ae float8 DEFAULT 0.0 NOT NULL, + threshold_ae float8 DEFAULT 0.0 NOT NULL, + scraper_name text, + prediction_if int4 DEFAULT 0 NULL, + prediction_ae int4 DEFAULT 0 NULL, + cloudflare_score int4 DEFAULT 0 NULL, + baskerville_score int4 DEFAULT 0 NULL, + + baskerville_score_1 int4 DEFAULT 0 NULL, + baskerville_score_2 int4 DEFAULT 0 NULL, + baskerville_score_3 int4 DEFAULT 0 NULL, + baskerville_score_4 int4 DEFAULT 0 NULL, + CONSTRAINT challenge_command_history_pkey PRIMARY KEY (challenge_command_id) +); +CREATE INDEX idx_hostname_command_type_to_command_history ON public.challenge_command_history USING btree (hostname_id, command_type_name); +CREATE INDEX commands_index ON challenge_command_history (session_end, host_name); + +CREATE INDEX idx_challenge_ip_created + ON challenge_command_history (ip_address, created_at); + +-- public.challenge_command_history foreign keys + +ALTER TABLE public.challenge_command_history ADD CONSTRAINT challenge_command_history_hostname_id_fkey FOREIGN KEY (hostname_id) REFERENCES public.hostname(hostname_id) ON DELETE CASCADE; + + + +CREATE TABLE IF NOT EXISTS public.dashboard_challenged_1h ( + bucket_hour timestamptz PRIMARY KEY, + challenged_ips bigint NOT NULL, + updated_at timestamptz NOT NULL DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS public.dashboard_passed_1h ( + bucket_hour timestamptz PRIMARY KEY, + passed_ips bigint NOT NULL, + updated_at timestamptz NOT NULL DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS public.dashboard_precision_1h ( + bucket_hour timestamptz PRIMARY KEY, + total_ips bigint NOT NULL, + passed_ips bigint NOT NULL, + precision_pct numeric(5,1) NOT NULL, + updated_at timestamptz NOT NULL DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS public.dashboard_human_bot_1h ( + bucket_hour timestamptz NOT NULL, + human_label text NOT NULL, + cnt bigint NOT NULL, + updated_at timestamptz NOT NULL DEFAULT now(), + PRIMARY KEY (bucket_hour, human_label) +); + + diff --git a/deployment/postgres_dev/postgres-baskervillehall-dev-pv.yaml b/deployment/postgres_dev/postgres-baskervillehall-dev-pv.yaml new file mode 100644 index 0000000..af64372 --- /dev/null +++ b/deployment/postgres_dev/postgres-baskervillehall-dev-pv.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: postgres-baskervillehall-dev-pv + labels: + type: local +spec: + storageClassName: csi-expandable + capacity: + storage: 30Gi + accessModes: + - ReadWriteOnce + hostPath: + path: "/mnt/data" \ No newline at end of file diff --git a/deployment/postgres_dev/postgres-baskervillehall-dev-pvc.yaml b/deployment/postgres_dev/postgres-baskervillehall-dev-pvc.yaml new file mode 100644 index 0000000..972eff9 --- /dev/null +++ b/deployment/postgres_dev/postgres-baskervillehall-dev-pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: postgres-baskervillehall-dev-pvc +spec: + storageClassName: retain-storage + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 30Gi diff --git a/deployment/postgres_dev/postgres-baskervillehall-dev-secret.yaml b/deployment/postgres_dev/postgres-baskervillehall-dev-secret.yaml new file mode 100644 index 0000000..693d44f --- /dev/null +++ b/deployment/postgres_dev/postgres-baskervillehall-dev-secret.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Secret +metadata: + name: postgres-baskervillehall-dev-secret +type: Opaque +stringData: + user: postgres + password: zJmh93FfhMvX6tDRNEoLuu97 + database: baskerville \ No newline at end of file diff --git a/deployment/postgres_dev/postgres-baskervillehall-dev-service.yaml b/deployment/postgres_dev/postgres-baskervillehall-dev-service.yaml new file mode 100644 index 0000000..25ca82d --- /dev/null +++ b/deployment/postgres_dev/postgres-baskervillehall-dev-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: postgres-baskervillehall-dev + labels: + app: postgres-baskervillehall-dev +spec: + selector: + app: postgres-baskervillehall-dev + type: ClusterIP + ports: + - port: 5432 + targetPort: 5432 \ No newline at end of file diff --git a/deployment/postgres_dev/postgres-baskervillehall-dev.yaml b/deployment/postgres_dev/postgres-baskervillehall-dev.yaml new file mode 100644 index 0000000..c7219d7 --- /dev/null +++ b/deployment/postgres_dev/postgres-baskervillehall-dev.yaml @@ -0,0 +1,59 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres-baskervillehall-dev +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: postgres-baskervillehall-dev + template: + metadata: + labels: + app: postgres-baskervillehall-dev + spec: + initContainers: + - name: prestart-clean-shm + image: busybox:1.36 + command: ["sh","-c","rm -f /dev/shm/PostgreSQL.* || true"] + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: postgres-pv-storage + persistentVolumeClaim: + claimName: postgres-baskervillehall-dev-pvc + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + containers: + - name: postgres + image: postgres:11 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 5432 + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-baskervillehall-dev-secret + key: password + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + volumeMounts: + - mountPath: /var/lib/postgresql/data + name: postgres-pv-storage + - mountPath: /dev/shm + name: dshm + readinessProbe: + exec: { command: ["pg_isready","-U","postgres"] } + initialDelaySeconds: 10 + periodSeconds: 5 + livenessProbe: + exec: { command: ["pg_isready","-U","postgres"] } + initialDelaySeconds: 30 + periodSeconds: 10 + diff --git a/session_deployment_ch.yaml b/session_deployment_ch.yaml index 91d20fc..e0fb2db 100644 --- a/session_deployment_ch.yaml +++ b/session_deployment_ch.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: "baskervillehall-session-ch" - image: "equalitie/baskervillehall:session2" + image: "equalitie/baskervillehall:session_dev" imagePullPolicy: Always command: - python diff --git a/smart_bot.py b/smart_bot.py index 9216599..42435cc 100755 --- a/smart_bot.py +++ b/smart_bot.py @@ -216,6 +216,23 @@ def run(self): ignore_https_errors=False, ) + # Block resources at context level (more aggressive) + resource_counts = {} + def block_resources_context(route): + rtype = route.request.resource_type + resource_counts[rtype] = resource_counts.get(rtype, 0) + 1 + + if rtype in ["image", "stylesheet", "font", "media", "script"]: + route.abort() + else: + route.continue_() + + if self.mode in ['extreme-cv', 'low-entropy', 'combined', 'high-consistency', 'high-consistency-file']: + context.route("**/*", block_resources_context) + print(f"\n[BLOCKING MODE] Images, CSS, JS, fonts, media will be blocked") + print(f" Only HTML requests will reach the server") + print(f" Expected: ~1 request per page load (instead of 10-20)") + # Stealth settings context.add_init_script(""" Object.defineProperty(navigator, 'webdriver', { @@ -225,18 +242,6 @@ def run(self): page = context.new_page() - # Block images, CSS, JS, fonts to reduce entropy (only load HTML) - # This makes bot behavior more obvious - def block_resources(route): - if route.request.resource_type in ["image", "stylesheet", "font", "media"]: - route.abort() - else: - route.continue_() - - # Enable resource blocking for modes that need low entropy - if self.mode in ['extreme-cv', 'low-entropy', 'combined', 'high-consistency', 'high-consistency-file']: - page.route("**/*", block_resources) - start_time = time.time() session_cookie = None @@ -326,6 +331,18 @@ def block_resources(route): else: print(f"\n⚠️ No session cookie detected - requests may be treated as separate sessions!") + # Show resource blocking stats + if self.mode in ['extreme-cv', 'low-entropy', 'combined', 'high-consistency', 'high-consistency-file']: + print(f"\nResource blocking stats:") + total_blocked = sum(count for rtype, count in resource_counts.items() + if rtype in ["image", "stylesheet", "font", "media", "script"]) + total_allowed = sum(count for rtype, count in resource_counts.items() + if rtype not in ["image", "stylesheet", "font", "media", "script"]) + for rtype, count in sorted(resource_counts.items()): + status = "BLOCKED" if rtype in ["image", "stylesheet", "font", "media", "script"] else "ALLOWED" + print(f" {rtype:15s}: {count:4d} {status}") + print(f" Total blocked: {total_blocked}, Total allowed: {total_allowed}") + print(f"\nExpected Baskerville features:") print(f" entropy: ~{len(set(urls)).bit_length():.1f} (LOW - repeating URLs)") print(f" request_rate: {rate:.1f}") diff --git a/src/baskervillehall/baskerville_rules.py b/src/baskervillehall/baskerville_rules.py index 8b49c32..99ad3b4 100644 --- a/src/baskervillehall/baskerville_rules.py +++ b/src/baskervillehall/baskerville_rules.py @@ -486,6 +486,17 @@ def path_suspicion_score(url: str, status_code: int) -> float: u = url.lower() score = 0.0 + # Check if this is a static resource (CSS, JS, images, fonts, etc.) + static_extensions = ('.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', + '.woff', '.woff2', '.ttf', '.eot', '.ico', '.mp4', '.webm', '.pdf') + is_static = any(u.endswith(ext) for ext in static_extensions) + + # wp-content and wp-includes are suspicious ONLY if: + # 1. They are .php files (exploit attempts) + # 2. They return 404 (scanning for vulnerabilities) + # 3. They are NOT static resources (legitimate site assets) + is_wp_path = ('/wp-content/' in u or '/wp-includes/' in u) + # Any .php request outside normal app flow is suspicious if ".php" in u: score += 0.3 @@ -493,6 +504,9 @@ def path_suspicion_score(url: str, status_code: int) -> float: # Explicit known exploit/scan patterns for pat in compiled_path_patterns: if pat.search(u): + # Don't penalize static resources from wp-content/wp-includes + if is_wp_path and is_static: + continue score += 0.5 break @@ -575,12 +589,15 @@ def get_baskerville_score_1(session: dict) -> int: lang_susp = 1.0 if num_languages == 0 else 0.0 # Weighted combination into bot suspicion raw score (0..1) - # Path is most informative for scanners, then UA, then cipher + language. + # UA is most informative (curl, wget, python-requests), then language, path, cipher. + # Increased ua_susp weight from 0.2 to 0.45 (45%) to better detect command-line tools + # Increased lang_susp weight from 0.15 to 0.2 (20%) - missing Accept-Language is strong signal + # Decreased path_susp weight from 0.5 to 0.2 (20%) - not all bots scan suspicious paths bot_susp_raw = ( - 0.5 * path_susp + - 0.2 * ua_susp + - 0.15 * cipher_susp + - 0.15 * lang_susp + 0.2 * path_susp + # 20% - path patterns (scanners) + 0.45 * ua_susp + # 45% - User-Agent (strongest signal for curl/wget/requests) + 0.15 * cipher_susp + # 15% - cipher type + 0.2 * lang_susp # 20% - Accept-Language presence ) # Convert into baskerville human-like score (1..99) diff --git a/src/baskervillehall/baskervillehall_predictor.py b/src/baskervillehall/baskervillehall_predictor.py index 62cd08b..e4fd9a8 100644 --- a/src/baskervillehall/baskervillehall_predictor.py +++ b/src/baskervillehall/baskervillehall_predictor.py @@ -327,6 +327,10 @@ def create_command( "rate_limit_expiration": rate_limit_expiration, "baskerville_score": int(baskerville_score), "cloudflare_score": session.get("cloudflare_score", 0), + "baskerville_score_1": session.get("baskerville_score_1", 0), + "baskerville_score_2": session.get("baskerville_score_2", 0), + "baskerville_score_3": session.get("baskerville_score_3", 0), + "baskerville_score_4": baskerville_score, } return d @@ -408,7 +412,14 @@ def _process_sessions_batch(self, host: str, human: bool, sessions: List[Dict], self.logger.info(f" Accept-Language: {sessions[i].get('accept_language', 'N/A')}") self.logger.info(f" num_languages: {sessions[i].get('num_languages', 'N/A')}") self.logger.info(f" cipher_type: {sessions[i].get('cipher_type', 'N/A')}") - self.logger.info(f"Baskerville score: {scores_classifier[i]}, Baskerville_1: {sessions[i].get('baskerville_score_1', 'N/A')}, Cloudflare score: {sessions[i].get('cloudflare_score', 0)}, bot: {predictions_classifier[i]}") + self.logger.info( + f"Baskerville score_4: {scores_classifier[i]}, " + f"score_1: {sessions[i].get('baskerville_score_1', 'N/A')}, " + f"score_2: {sessions[i].get('baskerville_score_2', 'N/A')}, " + f"score_3: {sessions[i].get('baskerville_score_3', 'N/A')}, " + f"Cloudflare: {sessions[i].get('cloudflare_score', 0)}, " + f"bot: {predictions_classifier[i]}" + ) # Log ALL features actually used by the model if features_df is not None and i < len(features_df): @@ -459,6 +470,13 @@ def _process_sessions_batch(self, host: str, human: bool, sessions: List[Dict], self.logger.info(f" {j+1}. {item['name']:35s} +{item['shapley']:6.3f}") self.logger.info(f"{'='*80}\n") + else: + self.logger.warning( + f"Baskerville classifier model not found for host={host}, " + f"baskerville_score_4 will be 0. Check if classifier model is trained and loaded from S3." + ) + else: + self.logger.debug("use_baskerville_score=False, skipping baskerville_score_4 computation") results = [] for i, session in enumerate(sessions): @@ -512,13 +530,11 @@ def _process_sessions_batch(self, host: str, human: bool, sessions: List[Dict], if 'entropy' in model_if.get_all_features(): entropy = float(vectors_if.iloc[i]['entropy']) - if human: - if scores_classifier is not None: - baskerville_score = int(scores_classifier[i]) - else: - baskerville_score = session.get('human_score', 99) + if scores_classifier is not None: + baskerville_score = int(scores_classifier[i]) else: - baskerville_score = session.get('human_score', 1) + baskerville_score = 0 + results.append( { "host": host, @@ -703,7 +719,10 @@ def _apply_decision_and_send( self.logger.info( f"Classifier {command} for ip={ip}, " f"human={human}, command={command}, session_id={session_id}, host={host}, " - f"baskerville_score={baskerville_score} " + f"baskerville_score_4={baskerville_score}, " + f"score_1={session.get('baskerville_score_1', 'N/A')}, " + f"score_2={session.get('baskerville_score_2', 'N/A')}, " + f"score_3={session.get('baskerville_score_3', 'N/A')}, " f"cloudflare_score={session.get('cloudflare_score', 0)}." ) payload = self.create_command( @@ -747,8 +766,11 @@ def _apply_decision_and_send( self.logger.info( f"High bot score - {command} for ip={ip}, " f"human={human}, command={command}, session_id={session_id}, host={host}, " - f"top_factor = {bot_score_top_factor} " - f"baskerville_score={baskerville_score} " + f"top_factor={bot_score_top_factor}, " + f"baskerville_score_4={baskerville_score}, " + f"score_1={session.get('baskerville_score_1', 'N/A')}, " + f"score_2={session.get('baskerville_score_2', 'N/A')}, " + f"score_3={session.get('baskerville_score_3', 'N/A')}, " f"cloudflare_score={session.get('cloudflare_score', 0)}." ) payload = self.create_command( @@ -783,11 +805,16 @@ def _apply_decision_and_send( else: command = "rate_limit" if self.use_rate_limit else "challenge_ip" baskerville_score = 1 - self.logger.info(f"Challenge ip (bad_bot)," - f"Baskerville score {baskerville_score}, " - f"Cloudflare score {session.get('cloudflare_score', 0)}, " - f"ip = {ip} host={host} " - f"ua={session.get('ua')} end={session.get('end')}") + self.logger.info( + f"Challenge ip (bad_bot), " + f"ip={ip}, host={host}, " + f"baskerville_score_3={baskerville_score}, " + f"score_1={session.get('baskerville_score_1', 'N/A')}, " + f"score_2={session.get('baskerville_score_2', 'N/A')}, " + f"cloudflare_score={session.get('cloudflare_score', 0)}, " + f"ua={session.get('ua')}, " + f"end={session.get('end')}" + ) payload = self.create_command( command_name=command, @@ -828,9 +855,13 @@ def _apply_decision_and_send( self.logger.info( f"meta {meta} - {command} for ip={ip}, " f"human={human}, command={command}, session_id={session_id}, host={host}, " - f"top_factor = {bot_score_top_factor} ua={session.get('ua')} " - f"baskerville_score={baskerville_score}. " - f"cloudflare_score={session.get('cloudflare_score', 0)} end={session.get('end')}." + f"top_factor={bot_score_top_factor}, ua={session.get('ua')}, " + f"baskerville_score_4={baskerville_score}, " + f"score_1={session.get('baskerville_score_1', 'N/A')}, " + f"score_2={session.get('baskerville_score_2', 'N/A')}, " + f"score_3={session.get('baskerville_score_3', 'N/A')}, " + f"cloudflare_score={session.get('cloudflare_score', 0)}, " + f"end={session.get('end')}." ) payload = self.create_command( command_name=command, @@ -871,10 +902,14 @@ def _apply_decision_and_send( baskerville_score = 25 self.logger.info( f"Too many sessions ({len(host_ip_sessions[host][ip])}) challenge_ip for ip={ip}, " - f"human={human} session_id={session_id}, host={host}, " - f"ua={session.get('ua')} " - f"baskerville_score={baskerville_score}. " - f"cloudflare_score={session.get('cloudflare_score', 0)} end={session.get('end')}." + f"human={human}, session_id={session_id}, host={host}, " + f"ua={session.get('ua')}, " + f"baskerville_score_4={baskerville_score}, " + f"score_1={session.get('baskerville_score_1', 'N/A')}, " + f"score_2={session.get('baskerville_score_2', 'N/A')}, " + f"score_3={session.get('baskerville_score_3', 'N/A')}, " + f"cloudflare_score={session.get('cloudflare_score', 0)}, " + f"end={session.get('end')}." ) payload = self.create_command( @@ -922,9 +957,14 @@ def _apply_decision_and_send( self.logger.info( f"Anomaly {command} for ip={ip}, human={human}, command={command}, " f"session_id={session_id}, host={host}, " - f"score_if={score_if}, score_ae={score_ae}, " - f"baskerville_score={baskerville_score}, baskerville_score_1={session.get('baskerville_score_1', 'N/A')}, " - f"cloudflare_score={session.get('cloudflare_score', 0)} end={session.get('end')}." + f"score_if={score_if}, score_ae={score_ae}, " + f"baskerville_score_5={baskerville_score}, " + f"score_1={session.get('baskerville_score_1', 'N/A')}, " + f"score_2={session.get('baskerville_score_2', 'N/A')}, " + f"score_3={session.get('baskerville_score_3', 'N/A')}, " + f"score_4={session.get('baskerville_score_4', 'N/A')}, " + f"cloudflare_score={session.get('cloudflare_score', 0)}, " + f"end={session.get('end')}." ) payload = self.create_command( command_name=command, @@ -953,8 +993,12 @@ def _apply_decision_and_send( self.logger.info( f"No command for ip={ip}, human={human}, " f"session_id={session_id}, host={host}, " - f"baskerville_score={baskerville_score}, baskerville_score_1={session.get('baskerville_score_1', 'N/A')}, " - f"cloudflare_score={session.get('cloudflare_score', 0)}, end={session.get('end')}." + f"baskerville_score_4={baskerville_score}, " + f"score_1={session.get('baskerville_score_1', 'N/A')}, " + f"score_2={session.get('baskerville_score_2', 'N/A')}, " + f"score_3={session.get('baskerville_score_3', 'N/A')}, " + f"cloudflare_score={session.get('cloudflare_score', 0)}, " + f"end={session.get('end')}." ) payload = self.create_command( command_name='no command', @@ -978,13 +1022,43 @@ def _apply_decision_and_send( ) self.send(producer, None, payload, key=host, dnet=dnet) - def process_immature_session(self, session): - self.logger.info(f"Immature session is_human={is_human(session)}, " - f"len={len(session['requests'])}, " - f"score1 = {session.get('baskerville_score_1', 'N/A')}, " - f"score2 = {session.get('baskerville_score_2', 'N/A')}, " - f"ip={session['ip']}, " - f"session_id={session['session_id']} ") + def process_immature_session(self, session, producer): + + payload = self.create_command( + command_name='no command', + session=session, + meta = 'immature_session', + prediction_if = 0, + score_if = 0, + shapley_if = 0, + shapley_feature_if = '', + prediction_ae = 0, + score_ae = 0, + shapley_ae = 0, + shapley_feature_ae = '', + difficulty = 0, + scraper_name = '', + threshold_ae = 0, + rate_limit_hits = 0, + rate_limit_interval = 0, + rate_limit_expiration = 0, + baskerville_score=0 + ) + self.send(producer=producer, + producer_output=None, + payload=payload, + key=session['host'], + dnet='') + + self.logger.info( + f"Immature session is_human={is_human(session)}, " + f"len={len(session['requests'])}, " + f"score_1={session.get('baskerville_score_1', 'N/A')}, " + f"score_2={session.get('baskerville_score_2', 'N/A')}, " + f"cloudflare_score={session.get('cloudflare_score', 0)}, " + f"ip={session['ip']}, " + f"session_id={session['session_id']}" + ) @@ -1052,6 +1126,7 @@ def run(self): predicting_total = 0 ip_whitelisted = 0 + seen_sessions = {} # Track duplicate sessions in this batch for message in messages: if (datetime.now() - ts_lag_report).total_seconds() > 5: try: @@ -1078,6 +1153,14 @@ def run(self): ip = session["ip"] host = message.key.decode("utf-8") + # Check for duplicate sessions in this batch + session_key = (ip, session.get('session_id', '-'), len(session.get('requests', []))) + if session_key in seen_sessions: + self.logger.warning(f"[DUPLICATE] Session seen before in this batch: " + f"ip={ip}, session_id={session.get('session_id')}, " + f"len={len(session.get('requests', []))}") + seen_sessions[session_key] = True + if whitelist_ip.is_in_whitelist(host, session["ip"]): ip_whitelisted += 1 continue @@ -1089,7 +1172,7 @@ def run(self): continue if session.get('immature_session', False): - self.process_immature_session(session) + self.process_immature_session(session, producer) continue if not session.get("primary_session", False): diff --git a/src/baskervillehall/baskervillehall_session.py b/src/baskervillehall/baskervillehall_session.py index 162e740..428d4b2 100644 --- a/src/baskervillehall/baskervillehall_session.py +++ b/src/baskervillehall/baskervillehall_session.py @@ -540,6 +540,10 @@ def send_session(self, session): """ Send session data in full format. """ + self.logger.info(f"[send_session] Sending session: ip={session['ip']}, " + f"session_id={session['session_id']}, " + f"len={len(session['requests'])}, " + f"immature={session.get('immature_session', False)}") t_fmt = self._t() requests = session['requests'] @@ -630,8 +634,8 @@ def send_session(self, session): 'cloudflare_score': session['cloudflare_score'], 'http_protocol': session.get('http_protocol', ''), 'immature_session': session.get('immature_session', False), - 'baskerville_score_1': session['baskerville_score_1'], - 'baskerville_score_2': session['baskerville_score_2'], + 'baskerville_score_1': session.get('baskerville_score_1', 50), + 'baskerville_score_2': session.get('baskerville_score_2', 50), } if self.current_lag > self.lag_critical_threshold: @@ -656,7 +660,13 @@ def send_session(self, session): session_final['vps_asn'] = vps_asn session_final['vpn'] = self.vpn_detector.is_vpn(session_final['ip']) session_final['tor'] = self.tor_exit_scnaner.is_tor(session_final['ip']) - session_final['baskerville_score_3'] = get_baskerville_score_3(session_final) + + # baskerville_score_3 should only be computed for full sessions, not immature + if session_final.get('immature_session', False): + session_final['baskerville_score_3'] = 50 # neutral score for immature sessions + else: + session_final['baskerville_score_3'] = get_baskerville_score_3(session_final) + session_final['human'] = is_human(session_final) session_final['bad_bot'] = baskerville_rules.is_bad_bot(session_final) @@ -1164,15 +1174,23 @@ def run(self): passed_challenge = len(data.get('cookies', {}).get('challengePassedCookie', '')) > 0 t_build = self._t() + + # Safe conversion for numeric fields (may be empty strings) + reply_length = data.get('reply_length_bytes', 0) + payload = int(reply_length) if reply_length not in ('', None) else 0 + + http_code = data.get('http_response_code', 0) + code = int(http_code) if http_code not in ('', None) else 0 + request = { 'ts': ts_event, 'dnet': dnet, 'url': url, 'ua': ua, 'query': data.get('querystring', ''), - 'code': data.get('http_response_code', 0), + 'code': code, 'type': data.get('content_type', ''), - 'payload': int(data.get('reply_length_bytes', 0)), + 'payload': payload, 'method': data.get('client_request_method', ''), 'edge': data.get('edge', ''), 'static': data.get('loc_in', '') == 'static_file', @@ -1227,6 +1245,7 @@ def run(self): self._acc('msg_session_lookup', t_lookup) if where == 'main': + self.logger.info(f"key={key} in main") if self.is_session_expired(session, ts_event): t_cr = self._t() session = self.create_session(ua, host, country, '', datacenter_code, ip, session_id, @@ -1248,6 +1267,7 @@ def run(self): self._last_sess_where = 'main' elif where == 'primary': + self.logger.info(f"key={key} already in primary") t_upd = self._t() self.update_session(session, request) self._acc('session_update', t_upd) @@ -1271,6 +1291,7 @@ def run(self): self._last_sess_where = 'main' else: + self.logger.info(f"key={key} is a new primary") t_cr = self._t() session = self.create_session(ua, host, country, '', datacenter_code, ip, session_id, verified_bot, ts_event, cipher, ciphers, request, diff --git a/src/baskervillehall/storage_base.py b/src/baskervillehall/storage_base.py index 8a50324..43ac3e9 100644 --- a/src/baskervillehall/storage_base.py +++ b/src/baskervillehall/storage_base.py @@ -51,7 +51,6 @@ def get_host_id(self, host): host not in self.hostname_id or \ (datetime.now() - self.host_id_timestamp).total_seconds() > 60 * 10: self.host_id_timestamp = datetime.now() - conn = None try: conn = psycopg2.connect(**self.postgres_connection) @@ -61,7 +60,6 @@ def get_host_id(self, host): self.hostname_id = dict() for r in cur.fetchall(): self.hostname_id[r[0]] = r[1] - if host not in self.hostname_id: if self.autocreate_hostname_id: sql = f'insert into public.hostname '\ @@ -162,6 +160,7 @@ def run(self): while True: self.delete_old_records() raw_messages = consumer.poll(timeout_ms=1000, max_records=self.batch_size) + self.logger.info(f'Raw messages: {len(raw_messages)}') for topic_partition, messages in raw_messages.items(): records = [] for message in messages: @@ -178,6 +177,8 @@ def run(self): continue records.append(json.loads(message.value.decode("utf-8"))) + self.logger.info(f'Records: {len(records)}') + self.logger.info(self.postgres_connection) conn = None sql = None try: @@ -186,6 +187,7 @@ def run(self): for r in records: sql = self.get_sql(r) + self.logger.info(sql) if sql: cur.execute(sql) conn.commit() diff --git a/src/baskervillehall/storage_commands.py b/src/baskervillehall/storage_commands.py index 9049b22..7b783f2 100644 --- a/src/baskervillehall/storage_commands.py +++ b/src/baskervillehall/storage_commands.py @@ -51,8 +51,13 @@ def get_sql(self, record): cloudflare_score = command.get("cloudflare_score", 0) if cloudflare_score == '': cloudflare_score = 0 - shapley_formatted_if = json.dumps(command['shapley_if']) if len(command['shapley_if']) > 0 else '' - shapley_formatted_ae = json.dumps(command['shapley_ae']) if len(command['shapley_ae']) > 0 else '' + + # Safe handling of shapley values (can be int, dict, list, or None) + shapley_if = command.get('shapley_if', {}) + shapley_formatted_if = json.dumps(shapley_if) if isinstance(shapley_if, (dict, list)) and shapley_if else '' + + shapley_ae = command.get('shapley_ae', {}) + shapley_formatted_ae = json.dumps(shapley_ae) if isinstance(shapley_ae, (dict, list)) and shapley_ae else '' ua = s["ua"].replace("\'", "") return f'insert into {self.table} (\n'\ f'hostname_id, host_name, ip_address, session_cookie, ip_cookie, '\ @@ -60,7 +65,9 @@ def get_sql(self, record): f'datacenter, hits, score_if, score_ae, threshold_ae, bot_score, bot_score_top_factor,' \ f'shapley_feature_if, shapley_feature_ae,difficulty, shapley_if, shapley_ae,request_count, command_type_name, source, \n'\ f'meta, hit_rate, num_user_agent,'\ - f'duration, session_start, session_end, requests,updated_by,scraper_name,prediction_if,prediction_ae,baskerville_score,cloudflare_score)\n'\ + f'duration, session_start, session_end, requests,updated_by,scraper_name,prediction_if,prediction_ae,'\ + f'baskerville_score,baskerville_score_1,baskerville_score_2,baskerville_score_3,baskerville_score_4,'\ + f'cloudflare_score)\n'\ f'values (\'{host_id}\', \'{host}\', \'{s["ip"]}\', \'{command["session_id"]}\',\n'\ f'\'{s["ip"]}_{command["session_id"]}\',{int(s["primary_session"])},\n'\ f'{int(s["human"])},'\ @@ -81,5 +88,9 @@ def get_sql(self, record): f'\'{requests}\', \'pipeline\',\'{s["scraper_name"]}\','\ f'{command["prediction_if"]},{command["prediction_ae"]},'\ f'{command.get("baskerville_score", 0)},'\ + f'{command.get("baskerville_score_1", 0)},'\ + f'{command.get("baskerville_score_2", 0)},'\ + f'{command.get("baskerville_score_3", 0)},'\ + f'{command.get("baskerville_score_4", 0)},'\ f'{cloudflare_score}'\ f');' diff --git a/src/baskervillehall/storage_sessions.py b/src/baskervillehall/storage_sessions.py index 207ed1e..a1d2b07 100644 --- a/src/baskervillehall/storage_sessions.py +++ b/src/baskervillehall/storage_sessions.py @@ -36,6 +36,7 @@ def get_sql(self, record): host = s["host"] host_id = self.get_host_id(host) if len(host_id) == 0: + self.logger.warning("No host id found for session {}".format(record)) return None if record.get('immature_session', False): return None diff --git a/storage_deployment_ch.yaml b/storage_deployment_ch.yaml index 6f00039..1773e45 100644 --- a/storage_deployment_ch.yaml +++ b/storage_deployment_ch.yaml @@ -22,7 +22,7 @@ spec: restartPolicy: Always containers: - name: "baskervillehall-storage-ch" - image: "equalitie/baskervillehall:session" + image: "equalitie/baskervillehall:session_dev" imagePullPolicy: Always command: - python @@ -40,15 +40,15 @@ spec: - name: POSTGRES_USER valueFrom: secretKeyRef: - name: postgres-secret - key: POSTGRES_USER + name: postgres-baskervillehall-dev-secret + key: user - name: POSTGRES_PASSWORD valueFrom: secretKeyRef: - name: postgres-secret - key: POSTGRES_PASSWORD + name: postgres-baskervillehall-dev-secret + key: password - name: POSTGRES_DATABASE_NAME valueFrom: secretKeyRef: - name: postgres-secret - key: POSTGRES_DATABASE_NAME + name: postgres-baskervillehall-dev-secret + key: database