-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscan_iac_files.py
More file actions
155 lines (130 loc) · 5.77 KB
/
scan_iac_files.py
File metadata and controls
155 lines (130 loc) · 5.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import logging
import re
import sqlite3 as sl
import time
import datetime
import requests
import os
from github import Github, GithubException
from github.ContentFile import ContentFile
# GitHub API token comes from the environment so it never lives in source.
gh_api_token = os.environ.get('GH_TOKEN')

# Heuristic patterns for classifying file contents by IaC flavor.
# Raw strings avoid invalid escape sequences ("\:" was a SyntaxWarning on
# modern Python); the patterns match exactly the same text as before.
# re.DOTALL lets ".*" span newlines: a CloudFormation template declares a
# "Resources:" section that eventually names an "AWS::..." resource type.
re_cfn = re.compile(r"Resources:.*AWS", re.DOTALL)
# Markers commonly seen in Kubernetes/Helm manifests.
# NOTE(review): the dots in ".*.k8s.io" are unescaped and match any character;
# kept as-is to preserve the existing matching behavior.
re_helm = re.compile(r"(namespace: kube-system|name: pod-exec|kind: ClusterRole|apiVersion: .*.k8s.io|apiVersion: v1)")
re_pulumi = re.compile(r"(P|p)ulumi")

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
def determine_iac_type(filename, content):
    """Classify a file as an IaC flavor: "tf", "cfn", "helm", "pulumi" or None.

    Checks run in priority order: Terraform is decided by extension alone,
    then content heuristics for CloudFormation and Helm, and finally Pulumi
    for Python files whose body mentions pulumi.
    """
    if filename.endswith(".tf"):
        return "tf"
    if re_cfn.search(content):
        return "cfn"
    if re_helm.search(content):
        return "helm"
    if filename.endswith(".py") and re_pulumi.search(content):
        return "pulumi"
    return None
def check_and_wait_for_limits(g):
    """Block until the GitHub search and core rate limits have capacity.

    Queries the client's rate-limit endpoint; if either bucket is exhausted,
    sleeps until its reset time plus a small safety margin.

    Args:
        g: an authenticated ``github.Github`` client.
    """
    search_limit = g.get_rate_limit().search
    if search_limit.remaining == 0:
        _sleep_until_reset(search_limit.reset, "search")
    # Re-fetch after a possible sleep so we inspect the current core window.
    core_limit = g.get_rate_limit().core
    if core_limit.remaining == 0:
        _sleep_until_reset(core_limit.reset, "core")


def _sleep_until_reset(reset, label):
    """Sleep until *reset* (a UTC datetime), with a 5s cushion for clock skew."""
    # PyGithub has returned both naive-UTC and tz-aware reset datetimes
    # depending on version; pick a matching "now" so the subtraction never
    # raises a naive/aware TypeError.
    if reset.tzinfo is None:
        now = datetime.datetime.utcnow()
    else:
        now = datetime.datetime.now(datetime.timezone.utc)
    # Clamp at zero: time.sleep() raises ValueError on negative input, which
    # can happen if the reset moment has already passed.
    seconds_to_sleep = max(0.0, (reset - now).total_seconds() + 5)
    logging.warning(f"Hit {label} rate limit, sleeping {seconds_to_sleep} seconds...")
    time.sleep(seconds_to_sleep)
def fetch_iac():
    """Scan GitHub organizations for IaC files and record classifications.

    Opens (or creates) the local SQLite database, then for every visible
    organization not already recorded in ORG runs four code searches —
    CloudFormation, Helm, Terraform/HCL and Pulumi — classifying each hit
    via search_org_code(). The org is marked done so reruns skip it.
    """
    g = Github(gh_api_token)
    con = sl.connect('gh-file-iac-classification.db')
    try:
        # Idempotent schema setup. The original one-shot DDL was commented out
        # and had to be run by hand before the first scan; IF NOT EXISTS makes
        # every run safe against a fresh database.
        with con:
            con.execute("""
                CREATE TABLE IF NOT EXISTS FILE (
                    url TEXT NOT NULL PRIMARY KEY,
                    git_url TEXT,
                    html_url TEXT,
                    repository_giturl TEXT,
                    iac TEXT
                );
            """)
            con.execute("""
                CREATE TABLE IF NOT EXISTS ORG (
                    login TEXT NOT NULL PRIMARY KEY
                );
            """)
        check_and_wait_for_limits(g)
        for org in g.get_organizations():
            # Skip orgs already fully processed in a previous run.
            with con:
                sqlcmd_find_org = """
                select * from ORG where login =?;
                """
                seen_org_before = con.execute(sqlcmd_find_org, (org.login,)).fetchone()
            if seen_org_before:
                logging.info(f"Skipping org: {org.login}")
                continue
            logging.info(f"Searching org: {org.login}")
            search_org_code(g, con, org, f"Resources in:file language:yaml org:{org.login}")  # CFN
            search_org_code(g, con, org, f"apiVersion in:file language:yaml org:{org.login}")  # Helm
            search_org_code(g, con, org, f"resource in:file language:hcl org:{org.login}")  # HCL
            search_org_code(g, con, org, f"pulumi in:file language:python org:{org.login}")  # Pulumi
            check_and_wait_for_limits(g)
            with con:
                sqlcmd_insert_org_info = """
                INSERT INTO ORG (login) values(?)
                ON CONFLICT(login) DO NOTHING;
                """
                con.execute(sqlcmd_insert_org_info, (org.login,))
                con.commit()
    finally:
        # Always release the SQLite handle, even if a search blows up.
        con.close()
def search_org_code(g, con, org, query):
    """Run one GitHub code search and classify/store every result.

    Iterates the paginated search results, classifies each file with
    determine_iac_type(), and upserts one row per file into the FILE table.
    Retries on GithubException while fetching results, honoring the
    Retry-After header when GitHub supplies one.

    Args:
        g: authenticated ``github.Github`` client.
        con: open sqlite3 connection containing a FILE table.
        org: the organization being scanned (already baked into *query*).
        query: a GitHub code-search query string.
    """
    code_search = g.search_code(query)
    code_search_iterator = None
    files_reviewed = 0
    try:
        while files_reviewed < code_search.totalCount:
            file = None
            # Keep retrying until we actually obtain the next result; GitHub
            # may throttle mid-iteration, in which case we sleep and retry.
            while not file:
                try:
                    if not code_search_iterator:
                        # Need to initilize the iterator every once in a while because every
                        # page in the results returns its own set of elements
                        logging.info("Getting new code search iterator")
                        code_search_iterator = iter(code_search)
                    file = next(code_search_iterator)
                except GithubException as e:
                    logging.exception("Hit Github Exception")
                    if e.headers.get("Retry-After", None):
                        # GitHub told us exactly how long to back off.
                        logging.info("Sleeping as requested.")
                        time.sleep(int(e.headers["Retry-After"]))
                    else:
                        # No hint from the API: back off a fixed minute.
                        logging.info("Sleeping a bit (no specific time specified GH).")
                        time.sleep(60)
                except StopIteration:
                    # Current iterator exhausted; re-create it to continue.
                    # NOTE(review): iter() on the same PaginatedList may replay
                    # earlier elements — confirm this cannot double-count hits.
                    code_search_iterator = iter(code_search)
            try:
                files_reviewed += 1
                # html_url carries the filename; decoded_content is the blob body.
                iac_type = determine_iac_type(file.html_url, file.decoded_content.decode('utf-8'))
                if iac_type:
                    logging.info(f"#{files_reviewed}: {file.html_url} is {iac_type}")
                else:
                    logging.info(f"#{files_reviewed}: Failed to determine IaC type for {file.html_url}")
                # Upsert so re-scans refresh the classification for a known URL.
                with con:
                    sqlcmd_insert_file_iac_info = """
                    INSERT INTO FILE (url, git_url, html_url, repository_giturl, iac) values(?, ?, ?, ?, ?)
                    ON CONFLICT(url) DO UPDATE SET iac=excluded.iac;
                    """
                    con.execute(sqlcmd_insert_file_iac_info, (file.url, file.git_url, file.html_url, file.repository.git_url, iac_type))
                    con.commit()
            except Exception:
                # Best-effort per file: log the failure and move to the next hit.
                logging.exception(f"Issue handling {file.url}")
            check_and_wait_for_limits(g)
    except GithubException as e:
        # Unrecoverable search failure for this query; give up on it quietly.
        logging.debug("Generic GithubException")
    except requests.exceptions.ReadTimeout:
        logging.error("ReadTimeout, continuing to the next one")
    except requests.exceptions.ConnectionError:
        logging.error("ConnectionError, continuing to the next one")
# Script entry point: run the full organization scan.
if __name__ == '__main__':
    fetch_iac()