diff --git a/gpu_bdb/bdb_tools/cluster_startup.py b/gpu_bdb/bdb_tools/cluster_startup.py
index de078717..378c1036 100755
--- a/gpu_bdb/bdb_tools/cluster_startup.py
+++ b/gpu_bdb/bdb_tools/cluster_startup.py
@@ -103,12 +103,13 @@ def maybe_create_worker_directories(dask_worker):
     # Assumes all GPUs are the same size
     expected_workers = int(os.environ.get("NUM_WORKERS", 16))
     worker_counts = worker_count_info(client)
+    current_workers = None
     for gpu_size, count in worker_counts.items():
         if count != 0:
             current_workers = worker_counts.pop(gpu_size)
             break
-
-    if expected_workers is not None and expected_workers != current_workers:
+
+    if expected_workers is not None and current_workers is not None and expected_workers != current_workers:
         print(
             f"Expected {expected_workers} {gpu_size} workers in your cluster, but got {current_workers}. It can take a moment for all workers to join the cluster. You may also have misconfigred hosts."
         )
diff --git a/gpu_bdb/bdb_tools/q27_bert_utils.py b/gpu_bdb/bdb_tools/q27_bert_utils.py
index 0d28e6f3..ebf0a8a5 100755
--- a/gpu_bdb/bdb_tools/q27_bert_utils.py
+++ b/gpu_bdb/bdb_tools/q27_bert_utils.py
@@ -172,23 +172,30 @@ def tokenize_text_series(text_ser, seq_len, stride, vocab_hash_file):
     A dictionary with these keys {'token_ar':,'attention_ar':,'metadata':}
     """
 
+    from cudf._lib.nvtext.subword_tokenize import Hashed_Vocabulary as cpp_hashed_vocabulary
+    from cudf._lib.nvtext.subword_tokenize import subword_tokenize_inmem_hash as cpp_subword_tokenize
+
+    vocab_file = cpp_hashed_vocabulary(vocab_hash_file)
+
     if len(text_ser) == 0:
         return {"token_ar": None, "attention_ar": None, "metadata": None}
 
-    max_num_chars = text_ser.str.len().sum() + 1
-    max_rows_tensor = len(text_ser) * 2
+    # max_num_chars = text_ser.str.len().sum() + 1
+    # max_rows_tensor = len(text_ser) * 2
     max_length = seq_len - 2
-    tokens, attention_masks, metadata = text_ser.str.subword_tokenize(
-        vocab_hash_file,
-        do_lower=False,
-        max_num_strings=max_rows_tensor,
-        max_rows_tensor=max_rows_tensor,
-        max_num_chars=max_num_chars,
-        stride=stride,
-        max_length=max_length,
-        do_truncate=False,
-    )
+    tokens, attention_mask, metadata = cpp_subword_tokenize(
+        text_ser._column,
+        vocab_file,
+        do_lower=False,
+        stride=stride,
+        max_sequence_length=max_length,
+        do_truncate=False,
+    )
+
+    tokens = cp.asarray(tokens)
+    attention_masks = cp.asarray(attention_mask)
+    metadata = cp.asarray(metadata)
 
     del text_ser
     ### reshape metadata into a matrix
     metadata = metadata.reshape(-1, 3)
@@ -301,7 +308,7 @@ def create_vocab_table(vocabpath):
     """
     id2vocab = []
     vocab2id = {}
-    with open(vocabpath) as f:
+    with open(vocabpath, encoding="utf-8") as f:
         for index, line in enumerate(f):
             token = line.split()[0]
             id2vocab.append(token)
diff --git a/gpu_bdb/bdb_tools/q27_get_review_sentence_utils.py b/gpu_bdb/bdb_tools/q27_get_review_sentence_utils.py
index b7ce3e21..1b7946d8 100755
--- a/gpu_bdb/bdb_tools/q27_get_review_sentence_utils.py
+++ b/gpu_bdb/bdb_tools/q27_get_review_sentence_utils.py
@@ -233,6 +233,7 @@ def get_org_sentences(sentence_boundary_df, org_df):
         inplace=True,
     )
     valid_left_loc = valid_left_loc.reset_index(drop=False)
+    valid_left_loc = valid_left_loc.rename(columns={"index":"flat_loc_org"})
 
     ### Better way to get the closeset row/col maybe
     valid_right_loc = (
@@ -249,8 +250,9 @@ def get_org_sentences(sentence_boundary_df, org_df):
     valid_right_loc = valid_right_loc[["r_fs_seq_row", "r_fs_seq_col"]].reset_index(
         drop=False
     )
+    valid_right_loc = valid_right_loc.rename(columns={"index":"flat_loc_org"})
 
-    valid_df = valid_left_loc.merge(valid_right_loc)
+    valid_df = valid_left_loc.merge(valid_right_loc, on="flat_loc_org")
     valid_df = valid_df.set_index(["flat_loc_org"])
 
     return valid_df
@@ -352,3 +354,4 @@ def get_org_df(pr_label_f, metadata_df, seq_len):
     org_df = org_df[flag]
 
     return org_df[["org_seq_row", "org_seq_col", "input_text_index", "flat_loc_org"]]
+
diff --git a/gpu_bdb/queries/q27/gpu_bdb_query_hf_27.py b/gpu_bdb/queries/q27/gpu_bdb_query_hf_27.py
index 312eb200..284c75fd 100755
--- a/gpu_bdb/queries/q27/gpu_bdb_query_hf_27.py
+++ b/gpu_bdb/queries/q27/gpu_bdb_query_hf_27.py
@@ -119,12 +119,11 @@ def main(client, config):
 
     import cudf
 
-    model_path = os.path.join(config["data_dir"], "../../q27_model_dir")
+    model_path = os.path.join(config["data_dir"], "/datasets/vjawa/distilbert-base-en-cased/")
     product_reviews_df = benchmark(
         read_tables,
         config=config,
         compute_result=config["get_read_time"],
-        dask_profile=config["dask_profile"],
     )
     product_reviews_df = product_reviews_df[
         product_reviews_df.pr_item_sk == q27_pr_item_sk
@@ -142,7 +141,8 @@ def main(client, config):
     )
     output_df = output_df.persist()
     wait(output_df)
-    client.run(del_model_attribute)
+    print(output_df.head())
+    #client.run(del_model_attribute)
 
     return output_df
 
@@ -153,5 +153,6 @@ def main(client, config):
 
     config = gpubdb_argparser()
     client, bc = attach_to_cluster(config)
-    client.run(rmm.reinitialize, pool_allocator=True, initial_pool_size=14e9)
+
+    #client.run(rmm.reinitialize, pool_allocator=True, initial_pool_size=14e9)
     run_query(config=config, client=client, query_func=main)
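
Note on the merge-key change in q27_get_review_sentence_utils.py: reset_index(drop=False) brings the old index back as a column literally named "index", so both frames rename that column to "flat_loc_org" and the merge is pinned to that key instead of relying on merge()'s default of joining on every column the frames happen to share. Below is a minimal sketch of the pattern, assuming a cudf environment; the sample data and the l_fs_seq_row name are made-up stand-ins, only "flat_loc_org" (and the r_fs_seq_row column it is joined alongside) comes from the patch.

    import cudf

    # Stand-ins for valid_left_loc / valid_right_loc; the values and the
    # l_fs_seq_row column name are illustrative only.
    left = cudf.DataFrame({"l_fs_seq_row": [0, 1, 2]}, index=[10, 11, 12])
    right = cudf.DataFrame({"r_fs_seq_row": [5, 6, 7]}, index=[10, 11, 12])

    # reset_index(drop=False) materialises the index as a column named "index";
    # renaming it gives both frames an explicit, shared join key.
    left = left.reset_index(drop=False).rename(columns={"index": "flat_loc_org"})
    right = right.reset_index(drop=False).rename(columns={"index": "flat_loc_org"})

    # Joining on the named key avoids merge() silently using whatever columns
    # the two frames have in common as the join condition.
    valid = left.merge(right, on="flat_loc_org").set_index("flat_loc_org")
    print(valid)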