Skip to content

Commit 1000148

Browse files
fix: add UTF-8 encoding to file read/write operations across multiple modules (#142)
1 parent c44a1ce commit 1000148

File tree

4 files changed

+17
-17
lines changed

4 files changed

+17
-17
lines changed

mostlyai/qa/_filesystem.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -157,11 +157,11 @@ def is_early_exit(self) -> bool:
157157
return self.early_exit_path.exists()
158158

159159
def store_meta(self, meta: dict):
160-
with open(self.meta_path, "w") as file:
160+
with open(self.meta_path, "w", encoding="utf-8") as file:
161161
json.dump(meta, file)
162162

163163
def load_meta(self) -> dict:
164-
with open(self.meta_path) as file:
164+
with open(self.meta_path, encoding="utf-8") as file:
165165
return json.load(file)
166166

167167
def store_bins(self, bins: dict[str, list]) -> None:

mostlyai/qa/_html_report.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -35,21 +35,21 @@
3535

3636
def get_uni_htmls(acc_uni: pd.DataFrame, workspace: TemporaryWorkspace) -> list[str]:
3737
paths_uni = workspace.get_figure_paths("univariate", acc_uni[["column"]]).values()
38-
return [f.read_text() for f in paths_uni]
38+
return [f.read_text(encoding="utf-8") for f in paths_uni]
3939

4040

4141
def get_cats_per_seq_htmls(acc_cats_per_seq: pd.DataFrame, workspace: TemporaryWorkspace) -> list[str]:
4242
paths_cats_per_seq = workspace.get_figure_paths(
4343
"distinct_categories_per_sequence", acc_cats_per_seq[["column"]]
4444
).values()
45-
return [f.read_text() for f in paths_cats_per_seq]
45+
return [f.read_text(encoding="utf-8") for f in paths_cats_per_seq]
4646

4747

4848
def get_seqs_per_cat_htmls(acc_seqs_per_cat: pd.DataFrame, workspace: TemporaryWorkspace) -> list[str]:
4949
paths_seqs_per_cat = workspace.get_figure_paths(
5050
"sequences_per_distinct_category", acc_seqs_per_cat[["column"]]
5151
).values()
52-
return [f.read_text() for f in paths_seqs_per_cat]
52+
return [f.read_text(encoding="utf-8") for f in paths_seqs_per_cat]
5353

5454

5555
def get_biv_htmls(acc_biv: pd.DataFrame, workspace: TemporaryWorkspace) -> tuple[list[str], list[str], list[str]]:
@@ -59,9 +59,9 @@ def get_biv_htmls(acc_biv: pd.DataFrame, workspace: TemporaryWorkspace) -> tuple
5959
paths_biv_ctx = workspace.get_figure_paths("bivariate", acc_biv_ctx[["col1", "col2"]]).values()
6060
paths_biv_tgt = workspace.get_figure_paths("bivariate", acc_biv_tgt[["col1", "col2"]]).values()
6161
paths_biv_nxt = workspace.get_figure_paths("bivariate", acc_biv_nxt[["col1", "col2"]]).values()
62-
html_biv_ctx = [f.read_text() for f in paths_biv_ctx]
63-
html_biv_tgt = [f.read_text() for f in paths_biv_tgt]
64-
html_biv_nxt = [f.read_text() for f in paths_biv_nxt]
62+
html_biv_ctx = [f.read_text(encoding="utf-8") for f in paths_biv_ctx]
63+
html_biv_tgt = [f.read_text(encoding="utf-8") for f in paths_biv_tgt]
64+
html_biv_nxt = [f.read_text(encoding="utf-8") for f in paths_biv_nxt]
6565
return html_biv_ctx, html_biv_tgt, html_biv_nxt
6666

6767

@@ -92,14 +92,14 @@ def store_report(
9292
acc_biv = filter_biv_acc_for_plotting(acc_biv, corr_trn)
9393
html_biv_ctx, html_biv_tgt, html_biv_nxt = get_biv_htmls(acc_biv=acc_biv, workspace=workspace)
9494

95-
correlation_matrix_html_chart = workspace.get_unique_figure_path("correlation_matrices").read_text()
95+
correlation_matrix_html_chart = workspace.get_unique_figure_path("correlation_matrices").read_text(encoding="utf-8")
9696
similarity_pca_html_chart_path = workspace.get_unique_figure_path("similarity_pca")
9797
similarity_pca_html_chart = None
9898
if similarity_pca_html_chart_path.exists():
99-
similarity_pca_html_chart = similarity_pca_html_chart_path.read_text()
99+
similarity_pca_html_chart = similarity_pca_html_chart_path.read_text(encoding="utf-8")
100100
if report_type == "model_report":
101-
accuracy_matrix_html_chart = workspace.get_unique_figure_path("accuracy_matrix").read_text()
102-
distances_dcr_html_chart = workspace.get_unique_figure_path("distances_dcr").read_text()
101+
accuracy_matrix_html_chart = workspace.get_unique_figure_path("accuracy_matrix").read_text(encoding="utf-8")
102+
distances_dcr_html_chart = workspace.get_unique_figure_path("distances_dcr").read_text(encoding="utf-8")
103103
else:
104104
accuracy_matrix_html_chart = None
105105
distances_dcr_html_chart = None
@@ -127,7 +127,7 @@ def store_report(
127127
bivariate_html_charts_ctx=html_biv_ctx,
128128
bivariate_html_charts_nxt=html_biv_nxt,
129129
)
130-
report_path.write_text(html)
130+
report_path.write_text(html, encoding="utf-8")
131131

132132

133133
def summarize_accuracies_by_column(
@@ -175,4 +175,4 @@ def summarize_accuracies_by_column(
175175
def store_early_exit_report(report_path: Path):
176176
template = Environment(loader=FileSystemLoader(HTML_ASSETS_PATH)).get_template(HTML_REPORT_EARLY_EXIT)
177177
report_html = template.render(html_assets=read_html_assets(), meta={})
178-
report_path.write_text(report_html)
178+
report_path.write_text(report_html, encoding="utf-8")

mostlyai/qa/assets/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,8 @@
2828
HTML_REPORT_EARLY_EXIT = "report_early_exit.html"
2929

3030

31-
def read_html_assets():
32-
return {fn: Path(HTML_ASSETS_PATH / fn).read_text() for fn in _HTML_ASSET_FILES}
31+
def read_html_assets() -> dict[str, str]:
32+
return {fn: Path(HTML_ASSETS_PATH / fn).read_text(encoding='utf-8') for fn in _HTML_ASSET_FILES}
3333

3434

3535
def load_tokenizer():

tests/unit/test_html_report.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ def test_generate_store_report(tmp_path, cols, workspace):
6666
)
6767
for path in plot_paths:
6868
path.parent.mkdir(parents=True, exist_ok=True)
69-
path.write_text("<div></div>")
69+
path.write_text("<div></div>", encoding="utf-8")
7070

7171
metrics = _calculate_metrics(
7272
acc_uni=acc_uni,

0 commit comments

Comments
 (0)