Merge pull request #70 from h-munakata/main

h-munakata · web-flow · commit 629bc6790c66 · 2025-11-20T15:29:56.000+09:00
Add CASTELLA
diff --git a/README.md b/README.md
@@ -12,6 +12,7 @@ It supports seven models, four features (video and audio features), and six data
 Furthermore, Lighthouse supports [audio moment retrieval](https://h-munakata.github.io/Language-based-Audio-Moment-Retrieval/), a task to identify relevant moments from an audio input based on a given text query.
 
 ## News
+- [2025/11/20] [Version 1.2](https://github.com/line/lighthouse/releases/tag/v1.2) Our work ["CASTELLA: Long Audio Dataset with Captions and Temporal Boundaries"](https://arxiv.org/abs/2511.15131) has been released. This update adds support for a new AMR dataset called CASTELLA.
 - [2025/06/04] [Version 1.1](https://github.com/line/lighthouse/releases/tag/v1.1) has been released. It includes API changes, AMR gradio demo, and huggingface wrappers for the audio moment retrieval and clotho dataset.
 - [2024/12/24] Our work ["Language-based audio moment retrieval"](https://arxiv.org/abs/2409.15672) has been accepted at ICASSP 2025.
 - [2024/10/22] [Version 1.0](https://github.com/line/lighthouse/releases/tag/v1.0) has been released.
@@ -142,6 +143,7 @@ Audio moment retrieval
 ### Pre-trained weights
 Pre-trained weights can be downloaded from [here](https://drive.google.com/file/d/1jxs_bvwttXTF9Lk3aKLohkqfYOonLyrO/view?usp=sharing).
 Download and unzip on the home directory.
+AMR models trained on CASTELLA and Clotho-Moment are available [here](https://zenodo.org/uploads/17422909).
 
 ### Datasets
 Due to copyright restrictions, we distribute only feature files here.
@@ -158,6 +160,7 @@ To extract features from videos, we use [HERO_Video_Feature_Extractor](https://g
 For [AMR](https://h-munakata.github.io/Language-based-Audio-Moment-Retrieval/), download features from here.
 
 - [Clotho Moment/TUT2017/UnAV100-subset](https://zenodo.org/records/13806234)
+- [CASTELLA](https://zenodo.org/records/17412176) [[Mirror on HF]](https://huggingface.co/datasets/lighthouse-emnlp2024/CASTELLA_CLAP_features)
 
 The whole directory should look like this:
 ```
diff --git a/configs/base.yml b/configs/base.yml
@@ -12,6 +12,7 @@ eval_bsz: 100
 grad_clip: 0.1
 max_q_l: 32
 max_v_l: 75
+max_a_l: 75
 max_windows: 5
 clip_length: 1
 eval_epoch_interval: 1
diff --git a/configs/dataset/castella.yml b/configs/dataset/castella.yml
@@ -0,0 +1,7 @@
+dset_name: castella
+clip_length: 1
+train_path: data/castella/castella_train_release.jsonl
+eval_path: data/castella/castella_val_release.jsonl
+
+max_a_l: 300
+max_v_l: 300
diff --git a/data/castella/castella_test_release.jsonl b/data/castella/castella_test_release.jsonl
diff --git a/data/castella/castella_train_release.jsonl b/data/castella/castella_train_release.jsonl
diff --git a/data/castella/castella_val_release.jsonl b/data/castella/castella_val_release.jsonl
diff --git a/training/cg_detr_dataset.py b/training/cg_detr_dataset.py
@@ -196,7 +196,7 @@ def __getitem__(self, index):
                         else:
                             model_inputs["saliency_pos_labels"], model_inputs["saliency_neg_labels"], model_inputs["saliency_all_labels"] = \
                                 self.get_saliency_labels_all(meta["relevant_clip_ids"], meta["saliency_scores"], ctx_l)
-                    elif self.dset_name in ['charades', 'tacos', 'activitynet', 'clotho-moment', 'unav100-subset', 'tut2017']: ## charades, tacos, nlq
+                    elif self.dset_name in ['charades', 'tacos', 'activitynet', 'clotho-moment', 'unav100-subset', 'tut2017', 'castella']: ## charades, tacos, nlq
                         model_inputs["saliency_pos_labels"], model_inputs["saliency_neg_labels"], model_inputs["saliency_all_labels"] = \
                             self.get_saliency_labels_sub_as_query(meta["relevant_windows"][0], meta["duration"], ctx_l)  # only one gt
                     else:
@@ -458,7 +458,7 @@ def _get_audio_feat_by_vid(self, vid):
                     raise NotImplementedError
                 _feat = l2_normalize_np_array(_feat) # normalize?
                 a_feat_list.append(_feat)
-            elif self.dset_name in ['clotho-moment', 'unav100-subset', 'tut2017']:
+            elif self.dset_name in ['clotho-moment', 'unav100-subset', 'tut2017', 'castella']:
                 if self.a_feat_types == "clap":
                     _feat_path = join(_feat_dir, f"{vid}.npz")
                     _feat = np.load(_feat_path)["features"][:self.max_a_l].astype(np.float32)
diff --git a/training/dataset.py b/training/dataset.py
@@ -212,7 +212,7 @@ def __getitem__(self, index):
                         model_inputs["saliency_pos_labels"], model_inputs["saliency_neg_labels"], model_inputs["saliency_all_labels"] = \
                             self.get_saliency_labels_all(meta["relevant_clip_ids"], meta["saliency_scores"], ctx_l)                        
                 
-                elif self.dset_name in ['charades', 'tacos', 'activitynet', 'clotho-moment', 'unav100-subset', 'tut2017']:
+                elif self.dset_name in ['charades', 'tacos', 'activitynet', 'clotho-moment', 'unav100-subset', 'tut2017', 'castella']:
                     model_inputs["saliency_pos_labels"], model_inputs["saliency_neg_labels"], model_inputs["saliency_all_labels"] = \
                         self.get_saliency_labels_sub_as_query(meta["relevant_windows"][0], ctx_l)
                 else:
@@ -480,7 +480,7 @@ def _get_audio_feat_by_vid(self, vid):
                     raise NotImplementedError
                 _feat = l2_normalize_np_array(_feat) # normalize?
                 a_feat_list.append(_feat)
-            elif self.dset_name in ['clotho-moment', 'unav100-subset', 'tut2017']:
+            elif self.dset_name in ['clotho-moment', 'unav100-subset', 'tut2017', 'castella']:
                 if self.a_feat_types == "clap":
                     _feat_path = join(_feat_dir, f"{vid}.npz")
                     _feat = np.load(_feat_path)["features"][:self.max_a_l].astype(np.float32)
diff --git a/training/evaluate.py b/training/evaluate.py
@@ -261,6 +261,12 @@ def compute_mr_results(epoch_i, model, eval_loader, opt, criterion=None):
             min_w_l=2, max_w_l=60, move_window_method="left",
             process_func_names=("clip_ts", "round_multiple")
         )
+    elif opt.dset_name in ['castella']:
+        post_processor = PostProcessorDETR(
+            clip_length=opt.clip_length, min_ts_val=0, max_ts_val=300,
+            min_w_l=1, max_w_l=300, move_window_method="left",
+            process_func_names=("clip_ts", "round_multiple")
+        )
     elif opt.dset_name in ['tacos', 'activitynet', 'youtube_highlight']:
         post_processor = PostProcessorDETR(
             clip_length=opt.clip_length, min_ts_val=0, max_ts_val=50000,
@@ -367,6 +373,7 @@ def start_inference(opt, domain=None):
         a_feat_types=opt.a_feat_types,
         max_q_l=opt.max_q_l,
         max_v_l=opt.max_v_l,
+        max_a_l=opt.max_a_l,
         clip_len=opt.clip_length,
         max_windows=opt.max_windows,
         span_loss_type=opt.span_loss_type,
@@ -375,7 +382,7 @@ def start_inference(opt, domain=None):
     
     eval_dataset = CGDETR_StartEndDataset(**dataset_config) if opt.model_name == 'cg_detr' else StartEndDataset(**dataset_config)
     model, criterion, _, _ = setup_model(opt)
-    checkpoint = torch.load(opt.model_path)
+    checkpoint = torch.load(opt.model_path, weights_only=False)
     model.load_state_dict(checkpoint["model"])
     logger.info("Model checkpoint: {}".format(opt.model_path))
     if not load_labels:
@@ -402,6 +409,8 @@ def check_valid_combination(dataset, feature, domain):
         'tvsum': ['resnet_glove', 'clip', 'clip_slowfast', 'i3d_clip'],
         'youtube_highlight': ['clip', 'clip_slowfast'],
         'clotho-moment': ['clap'],
+        'unav100-subset': ['clap'],
+        'castella': ['clap'],
     }
 
     domain_map = {
@@ -421,8 +430,8 @@ def check_valid_combination(dataset, feature, domain):
                         choices=['moment_detr', 'qd_detr', 'eatr', 'cg_detr', 'uvcom', 'tr_detr', 'taskweave_hd2mr', 'taskweave_mr2hd'],
                         help='model name. select from [moment_detr, qd_detr, eatr, cg_detr, uvcom, tr_detr, taskweave_hd2mr, taskweave_mr2hd]')
     parser.add_argument('--dataset', '-d', type=str, required=True,
-                        choices=['activitynet', 'charades', 'qvhighlight', 'qvhighlight_pretrain', 'tacos', 'tvsum', 'youtube_highlight', 'clotho-moment', 'unav100-subset', 'tut2017'],
-                        help='dataset name. select from [activitynet, charades, qvhighlight, qvhighlight_pretrain, tacos, tvsum, youtube_highlight, clotho-moment, unav100-subset, tut2017]')
+                        choices=['activitynet', 'charades', 'qvhighlight', 'qvhighlight_pretrain', 'tacos', 'tvsum', 'youtube_highlight', 'clotho-moment', 'unav100-subset', 'tut2017', 'castella'],
+                        help='dataset name. select from [activitynet, charades, qvhighlight, qvhighlight_pretrain, tacos, tvsum, youtube_highlight, clotho-moment, unav100-subset, tut2017, castella]')
     parser.add_argument('--feature', '-f', type=str, required=True,
                         choices=['resnet_glove', 'clip', 'clip_slowfast', 'clip_slowfast_pann', 'i3d_clip', 'clap'],
                         help='feature name. select from [resnet_glove, clip, clip_slowfast, clip_slowfast_pann, i3d_clip, clap].'
diff --git a/training/train.py b/training/train.py
@@ -137,7 +137,6 @@ def train_epoch(model, criterion, train_loader, optimizer, opt, epoch_i):
             losses.backward()
         else:
             outputs = model(**model_inputs, targets=targets) if opt.model_name == 'cg_detr' else model(**model_inputs)
-            
             loss_dict = criterion(outputs, targets)
             losses = sum(loss_dict[k] * criterion.weight_dict[k] for k in loss_dict.keys() if k in criterion.weight_dict)
             
@@ -228,6 +227,7 @@ def main(opt, resume=None, domain=None):
         a_feat_types=opt.a_feat_types,
         max_q_l=opt.max_q_l,
         max_v_l=opt.max_v_l,
+        max_a_l=opt.max_a_l,
         clip_len=opt.clip_length,
         max_windows=opt.max_windows,
         span_loss_type=opt.span_loss_type,
@@ -246,7 +246,7 @@ def main(opt, resume=None, domain=None):
     
     # load checkpoint for QVHighlight pretrain -> finetune
     if resume is not None:
-        checkpoint = torch.load(resume)
+        checkpoint = torch.load(resume, weights_only=False)
         model.load_state_dict(checkpoint["model"])
         logger.info("Loaded model checkpoint: {}".format(resume))
     
@@ -267,6 +267,7 @@ def check_valid_combination(dataset, feature, domain):
         'tvsum': ['resnet_glove', 'clip', 'clip_slowfast', 'i3d_clip'],
         'youtube_highlight': ['clip', 'clip_slowfast'],
         'clotho-moment': ['clap'],
+        'castella': ['clap'],
     }
 
     domain_map = {
@@ -286,8 +287,8 @@ def check_valid_combination(dataset, feature, domain):
                         choices=['moment_detr', 'qd_detr', 'eatr', 'cg_detr', 'uvcom', 'tr_detr', 'taskweave_hd2mr', 'taskweave_mr2hd'],
                         help='model name. select from [moment_detr, qd_detr, eatr, cg_detr, uvcom, tr_detr, taskweave_hd2mr, taskweave_mr2hd]')
     parser.add_argument('--dataset', '-d', type=str, required=True,
-                        choices=['activitynet', 'charades', 'qvhighlight', 'qvhighlight_pretrain', 'tacos', 'tvsum', 'youtube_highlight', 'clotho-moment'],
-                        help='dataset name. select from [activitynet, charades, qvhighlight, qvhighlight_pretrain, tacos, tvsum, youtube_highlight, clotho-moment]')
+                        choices=['activitynet', 'charades', 'qvhighlight', 'qvhighlight_pretrain', 'tacos', 'tvsum', 'youtube_highlight', 'clotho-moment', 'castella'],
+                        help='dataset name. select from [activitynet, charades, qvhighlight, qvhighlight_pretrain, tacos, tvsum, youtube_highlight, clotho-moment, castella]')
     parser.add_argument('--feature', '-f', type=str, required=True,
                         choices=['resnet_glove', 'clip', 'clip_slowfast', 'clip_slowfast_pann', 'i3d_clip', 'clap'],
                         help='feature name. select from [resnet_glove, clip, clip_slowfast, clip_slowfast_pann, i3d_clip, clap].'