ziplab · xuao575 · Jul 30, 2024
diff --git a/longvlm/eval/run_inference_benchmark.py b/longvlm/eval/run_inference_benchmark.py
@@ -5,7 +5,6 @@
 import json
 from tqdm import tqdm
 import pickle
-from longvlm.eval.model_utils import initialize_model
 from transformers import AutoTokenizer
 from longvlm.utils import disable_torch_init
 from longvlm.constants import *
@@ -56,15 +55,20 @@ def initialize_model(llm_model, model_name, projection_path=None): #, args=None)
     Returns:
     tuple: Model, vision tower, tokenizer, image processor, vision config, and video token length.
     """
+    # Select the device at runtime: use CUDA when it is available, otherwise fall back to CPU
+    def get_device_map() -> str:
+        return 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = get_device_map()
+
     # Disable initial torch operations
     disable_torch_init()
 
     # Convert model name to user path
     model_name = os.path.expanduser(model_name)
     # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map=device)
     # Load model
-    model = model_dict[llm_model].from_pretrained(model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16, use_cache=True)
+    model = model_dict[llm_model].from_pretrained(model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16, use_cache=True, device_map=device)
     # print(model)
     mm_use_vid_start_end = True
     # Add tokens to tokenizer
@@ -76,9 +80,10 @@ def initialize_model(llm_model, model_name, projection_path=None): #, args=None)
     model.resize_token_embeddings(len(tokenizer))
 
     # Load the weights from projection_path after resizing the token_embeddings
+
     if projection_path:
         print(f"Loading weights from {projection_path}")
-        status = model.load_state_dict(torch.load(projection_path, map_location='cpu'), strict=False)
+        status = model.load_state_dict(torch.load(projection_path, map_location=device), strict=False)
         if status.unexpected_keys:
             print(f"Unexpected Keys: {status.unexpected_keys}.\nThe model weights are not loaded correctly.")
         print(f"Weights loaded from {projection_path}")

diff --git a/run.sh b/run.sh
@@ -80,7 +80,8 @@ python longvlm/eval/run_inference_benchmark.py \
     --gt_file ${GT_FILE} \
     --output_dir ${OUTPUT_DIR} \
     --output_name anet_generic_qa \
-    --model-name ${PRETRAINED_PATH}
+    --model-name ${PRETRAINED_PATH} \
+    --projection_path ${PROJ_PATH}
 
 
 ### FOR evaluation

diff --git a/scripts/save_features.py b/scripts/save_features.py
@@ -8,7 +8,7 @@
 from tqdm import tqdm
 from decord import VideoReader, cpu
 from transformers import CLIPVisionModel, CLIPImageProcessor
-from longvlm.merge import merge_tokens 
+from longvlm.model.merge import merge_tokens
 
 
 
@@ -58,6 +58,7 @@ def parse_args():
     parser.add_argument("--clip_feat_path_memory", required=True, help="The output dir to save the memory features.")
     parser.add_argument("--pretrained_path", default="./pretrained/clip-vit-large-patch14", help="Path to load the model config from." )
     parser.add_argument("--list_file", default="./datasets/anet/v1-2_val_subset_split1.txt", help="Path to the video list." )
+    parser.add_argument("--infer_batch", default=1, help="Inference batch size." )
     args = parser.parse_args()
 
     return args