From f470358340596339334bda3b268f26c392764853 Mon Sep 17 00:00:00 2001 From: Akshaj Verma Date: Thu, 9 Dec 2021 23:11:59 +0530 Subject: [PATCH 1/8] Add new file. --- pytorchvideo/neural_engine/test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pytorchvideo/neural_engine/test.py diff --git a/pytorchvideo/neural_engine/test.py b/pytorchvideo/neural_engine/test.py new file mode 100644 index 00000000..e69de29b From d4a572852cc2780acd22cf38b0cb38112024e0d0 Mon Sep 17 00:00:00 2001 From: Akshaj Verma Date: Mon, 13 Dec 2021 20:13:33 +0530 Subject: [PATCH 2/8] Initial commit. --- pytorchvideo/neural_engine/retrieval_hook.py | 104 +++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 pytorchvideo/neural_engine/retrieval_hook.py diff --git a/pytorchvideo/neural_engine/retrieval_hook.py b/pytorchvideo/neural_engine/retrieval_hook.py new file mode 100644 index 00000000..f804e51c --- /dev/null +++ b/pytorchvideo/neural_engine/retrieval_hook.py @@ -0,0 +1,104 @@ +import cv2 +import torch + + +def process_video(video_path, model, max_frames): + video = cv2.VideoCapture(video_path) + + frame_count = 0 + + frame_tracker = [] + + while True: + is_frame, frame = video.read() + + if not is_frame: + break + + # generate model output + model_outputs = model.predict(frame) + + # get bounding boxes (x1, y1, x2, y2) + bboxes_per_frame = model_outputs["instances"][ + model_outputs["instances"].pred_classes == 0 + ].pred_boxes + bboxes_per_frame = bboxes_per_frame.tensor.to("cpu").squeeze() + + # calculate bbox center (x1, y1, x2, y2) + bboxes_per_frame_center_x = ( + bboxes_per_frame[:, 0] + bboxes_per_frame[:, 2] + ) / 2 # (x1+x2)/2 + bboxes_per_frame_center_y = ( + bboxes_per_frame[:, 1] + bboxes_per_frame[:, 3] + ) / 2 # (y1+y2)/2 + + # get keypoints + keypoints_per_frame = model_outputs["instances"][ + model_outputs["instances"].pred_classes == 0 + ].pred_keypoints + keypoints_per_frame = keypoints_per_frame[:, :, :2].to("cpu") + + # change origin of the keypoints to center of each bbox + keypoints_per_frame[:, :, 0] = keypoints_per_frame[ + :, :, 0 + ] - bboxes_per_frame_center_x.unsqueeze(1) + keypoints_per_frame[:, :, 1] = keypoints_per_frame[ + :, :, 1 + ] - bboxes_per_frame_center_y.unsqueeze(1) + + for i in range(bboxes_per_frame.shape[0]): + bbox_coord = bboxes_per_frame[i, :] + keypoint_per_bbox = keypoints_per_frame[i, :, :] + + bbox_info = { + "bbox_id": i, + "person_id": None, + "frame_id": frame_count, + "bbox_coord": bbox_coord, + "keypoint_coord": keypoint_per_bbox, + } + + frame_tracker.append(bbox_info) + + frame_count += 1 + + if frame_count == max_frames: + break + + video.release() + cv2.destroyAllWindows() + + return frame_tracker + + +def create_keypoint_features_db(frame_tracker): + return torch.stack([bbox["keypoint_coord"].flatten() for bbox in frame_tracker]) + + +def calculate_distance(action_query, keypoint_db): + scores = action_query @ keypoint_db.T + return scores + + +def get_closest_matches(scores, method, n): + if method == "topk": + return torch.topk(scores, n).indices.squeeze() + elif method == "softmax": + score_probs = torch.nn.functional.softmax(scores, dim=1) + return (score_probs > n).squeeze().nonzero() + + +## workflow + +# frame_tracker = process_video( +# video_path="./sample_video.mp4", model=keypoint_detection_model, max_frames=5 +# ) +# +# keypoint_db = create_keypoint_features_db(frame_tracker) +# +# scores = calculate_distance(action_query, keypoint_db) +# +# best_bbox_matches = get_closest_matches(scores, 
method="topk", n=10) +# +# for bbox_id in best_bbox_matches.tolist(): +# print(frame_tracker[bbox_id]["frame_id"]) From 5917713881508071a29bf7a8705d5e8a5ff2f334 Mon Sep 17 00:00:00 2001 From: Akshaj Verma Date: Sat, 18 Dec 2021 17:28:27 +0530 Subject: [PATCH 3/8] Refactored functions to hooks. --- pytorchvideo/neural_engine/hook.py | 49 +++++++ pytorchvideo/neural_engine/retrieval_hook.py | 128 ++++++------------- pytorchvideo/neural_engine/test.py | 0 3 files changed, 90 insertions(+), 87 deletions(-) delete mode 100644 pytorchvideo/neural_engine/test.py diff --git a/pytorchvideo/neural_engine/hook.py b/pytorchvideo/neural_engine/hook.py index 8cdc2238..cd3d2b37 100644 --- a/pytorchvideo/neural_engine/hook.py +++ b/pytorchvideo/neural_engine/hook.py @@ -72,6 +72,7 @@ def full_decode(status: OrderedDict, **args): decode_audio = args.get("decode_audio", True) video = EncodedVideo.from_path(status["path"], decode_audio, decoder) frames = video.get_clip(0, video.duration) + return frames @@ -87,6 +88,7 @@ def __init__( # Decoding params self.decode_audio = decode_audio self.decoder = decoder + # Hook params self.executor = executor self.inputs = ["path"] @@ -94,6 +96,52 @@ def __init__( self.fail_strategy = fail_strategy self.priority = priority + # frame and bounding-box tracker + self.frame_tracker = [] + + def _populate_frame_tracker(self, model, frames): + """ + Generates a data structure to track bounding boxes and + keypoint coordinates. Useful for extracting the frame-id given + the bounding number from a video for action-recognition. + """ + + for frame_id, frame in enumerate(frames): + model_outputs = model.predict(frame) + + # get bounding-box coordinates (x1, y1, x2, y2) + bboxes_per_frame = ( + model_outputs["instances"][model_outputs["instances"].pred_classes == 0] + .pred_boxes.tensor.to("cpu") + .squeeze() + ) + + # get keypoints (slice to select only the x,y coordinates) + keypoints_per_frame = ( + model_outputs["instances"][model_outputs["instances"].pred_classes == 0] + .pred_keypoints[:, :, :2] + .to("cpu") + ) + + if bboxes_per_frame.shape[0] != keypoints_per_frame.shape[0]: + raise ValueError( + "bboxes_per_frame and keypoints_per_frame should have same 0th dim." 
+ ) + + for i in range(bboxes_per_frame.shape[0]): + bbox_coord = bboxes_per_frame[i, :] + keypoint_per_bbox = keypoints_per_frame[i, :, :] + + bbox_info = { + "frame_id": frame_id, + "bbox_id": i, + "person_id": None, + "bbox_coord": bbox_coord, + "keypoint_coord": keypoint_per_bbox, + } + + self.frame_tracker.append(bbox_info) + def _run( self, status: OrderedDict, @@ -101,6 +149,7 @@ def _run( frames = self.executor( status, decode_audio=self.decode_audio, decoder=self.decoder ) + return frames diff --git a/pytorchvideo/neural_engine/retrieval_hook.py b/pytorchvideo/neural_engine/retrieval_hook.py index f804e51c..87d69306 100644 --- a/pytorchvideo/neural_engine/retrieval_hook.py +++ b/pytorchvideo/neural_engine/retrieval_hook.py @@ -1,104 +1,58 @@ -import cv2 -import torch - - -def process_video(video_path, model, max_frames): - video = cv2.VideoCapture(video_path) - - frame_count = 0 - - frame_tracker = [] - - while True: - is_frame, frame = video.read() +from collections import OrderedDict +from typing import Callable - if not is_frame: - break - - # generate model output - model_outputs = model.predict(frame) - - # get bounding boxes (x1, y1, x2, y2) - bboxes_per_frame = model_outputs["instances"][ - model_outputs["instances"].pred_classes == 0 - ].pred_boxes - bboxes_per_frame = bboxes_per_frame.tensor.to("cpu").squeeze() +import torch +from hook import HookBase - # calculate bbox center (x1, y1, x2, y2) - bboxes_per_frame_center_x = ( - bboxes_per_frame[:, 0] + bboxes_per_frame[:, 2] - ) / 2 # (x1+x2)/2 - bboxes_per_frame_center_y = ( - bboxes_per_frame[:, 1] + bboxes_per_frame[:, 3] - ) / 2 # (y1+y2)/2 - # get keypoints - keypoints_per_frame = model_outputs["instances"][ - model_outputs["instances"].pred_classes == 0 - ].pred_keypoints - keypoints_per_frame = keypoints_per_frame[:, :, :2].to("cpu") +def create_keypoint_features_db(frame_tracker): + return torch.stack([bbox["keypoint_coord"].flatten() for bbox in frame_tracker]) - # change origin of the keypoints to center of each bbox - keypoints_per_frame[:, :, 0] = keypoints_per_frame[ - :, :, 0 - ] - bboxes_per_frame_center_x.unsqueeze(1) - keypoints_per_frame[:, :, 1] = keypoints_per_frame[ - :, :, 1 - ] - bboxes_per_frame_center_y.unsqueeze(1) - for i in range(bboxes_per_frame.shape[0]): - bbox_coord = bboxes_per_frame[i, :] - keypoint_per_bbox = keypoints_per_frame[i, :, :] +def calculate_distance_scores(action_query, keypoint_feature_db): + scores = action_query @ keypoint_feature_db.T + return scores - bbox_info = { - "bbox_id": i, - "person_id": None, - "frame_id": frame_count, - "bbox_coord": bbox_coord, - "keypoint_coord": keypoint_per_bbox, - } - frame_tracker.append(bbox_info) +def get_closest_keypoint_feature_match(scores, method, n): + if method == "topk": + return torch.topk(scores, n).indices.squeeze().tolist() + elif method == "softmax": + score_probs = torch.nn.functional.softmax(scores, dim=1) + return (score_probs > n).squeeze().nonzero().tolist()[0] - frame_count += 1 - if frame_count == max_frames: - break +def bbox_to_frame_executor(frame_tracker, best_bbox_matches): + return [frame_tracker[bbox_id]["frame_id"] for bbox_id in best_bbox_matches] - video.release() - cv2.destroyAllWindows() - return frame_tracker +class PeopleKeypointRetrievalHook(HookBase): + def __init__(self, executor: Callable = bbox_to_frame_executor): + self.executor = executor + self.inputs = ["frame_tracker", "action_query"] + self.outputs = ["frame_id"] + def _run(self, status: OrderedDict): + # extract frame_tracker and action_query 
feature + frame_tracker = status["frame_tracker"] + action_query = status["action_query"] -def create_keypoint_features_db(frame_tracker): - return torch.stack([bbox["keypoint_coord"].flatten() for bbox in frame_tracker]) + # combine multiple keypoint features into a single tensor + keypoint_feature_db = create_keypoint_features_db(frame_tracker) + # find feature closest to action_query from the keypoint_feature_db + distance_scores = calculate_distance_scores( + action_query=action_query, keypoint_feature_db=keypoint_feature_db + ) -def calculate_distance(action_query, keypoint_db): - scores = action_query @ keypoint_db.T - return scores + # extract the index (bbox_id) of the best matches + best_bbox_match_list = get_closest_keypoint_feature_match( + scores=distance_scores, method="softmax", n=0.9 + ) + # get frame_id_list from the best_bbox_match_list + frame_id_list = self.executor( + frame_tracker=frame_tracker, best_bbox_matches=best_bbox_match_list + ) -def get_closest_matches(scores, method, n): - if method == "topk": - return torch.topk(scores, n).indices.squeeze() - elif method == "softmax": - score_probs = torch.nn.functional.softmax(scores, dim=1) - return (score_probs > n).squeeze().nonzero() - - -## workflow - -# frame_tracker = process_video( -# video_path="./sample_video.mp4", model=keypoint_detection_model, max_frames=5 -# ) -# -# keypoint_db = create_keypoint_features_db(frame_tracker) -# -# scores = calculate_distance(action_query, keypoint_db) -# -# best_bbox_matches = get_closest_matches(scores, method="topk", n=10) -# -# for bbox_id in best_bbox_matches.tolist(): -# print(frame_tracker[bbox_id]["frame_id"]) + return {"frame_id_list": frame_id_list} diff --git a/pytorchvideo/neural_engine/test.py b/pytorchvideo/neural_engine/test.py deleted file mode 100644 index e69de29b..00000000 From c386a858ce7f8de2e71a9efc9e3b294f34d2e071 Mon Sep 17 00:00:00 2001 From: Akshaj Verma Date: Sat, 18 Dec 2021 17:45:58 +0530 Subject: [PATCH 4/8] Center keypoint coordinates per bbox. --- pytorchvideo/neural_engine/hook.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pytorchvideo/neural_engine/hook.py b/pytorchvideo/neural_engine/hook.py index cd3d2b37..acf11544 100644 --- a/pytorchvideo/neural_engine/hook.py +++ b/pytorchvideo/neural_engine/hook.py @@ -76,6 +76,26 @@ def full_decode(status: OrderedDict, **args): return frames +def center_keypoints_in_bbox(bboxes_per_frame, keypoints_per_frame): + # calculate bbox center (x1, y1, x2, y2) + bboxes_per_frame_center_x = ( + bboxes_per_frame[:, 0] + bboxes_per_frame[:, 2] + ) / 2 # (x1+x2)/2 + bboxes_per_frame_center_y = ( + bboxes_per_frame[:, 1] + bboxes_per_frame[:, 3] + ) / 2 # (y1+y2)/2 + + # change origin of the keypoints to center of each bbox + keypoints_per_frame[:, :, 0] = keypoints_per_frame[ + :, :, 0 + ] - bboxes_per_frame_center_x.unsqueeze(1) + keypoints_per_frame[:, :, 1] = keypoints_per_frame[ + :, :, 1 + ] - bboxes_per_frame_center_y.unsqueeze(1) + + return keypoints_per_frame + + class DecodeHook(HookBase): def __init__( self, @@ -123,11 +143,19 @@ def _populate_frame_tracker(self, model, frames): .to("cpu") ) + # center keypoints wrt to the respective bounding box centers + keypoints_per_frame = center_keypoints_in_bbox( + bboxes_per_frame=bboxes_per_frame, + keypoints_per_frame=keypoints_per_frame, + ) + + # sanity check if bboxes_per_frame.shape[0] != keypoints_per_frame.shape[0]: raise ValueError( "bboxes_per_frame and keypoints_per_frame should have same 0th dim." 
) + # append bbox_info to frame_tracker for i in range(bboxes_per_frame.shape[0]): bbox_coord = bboxes_per_frame[i, :] keypoint_per_bbox = keypoints_per_frame[i, :, :] From a0b1dcc3ed774381d5d5921f2ea1c860e9215981 Mon Sep 17 00:00:00 2001 From: Akshaj Verma Date: Sat, 18 Dec 2021 17:56:35 +0530 Subject: [PATCH 5/8] L2 norm before dot product. --- pytorchvideo/neural_engine/retrieval_hook.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorchvideo/neural_engine/retrieval_hook.py b/pytorchvideo/neural_engine/retrieval_hook.py index 87d69306..690f6bed 100644 --- a/pytorchvideo/neural_engine/retrieval_hook.py +++ b/pytorchvideo/neural_engine/retrieval_hook.py @@ -10,7 +10,10 @@ def create_keypoint_features_db(frame_tracker): def calculate_distance_scores(action_query, keypoint_feature_db): - scores = action_query @ keypoint_feature_db.T + scores = ( + torch.nn.functional.normalize(action_query) + @ torch.nn.functional.normalize(keypoint_feature_db).T + ) return scores From 958fc2743dc0511250006fe29f81179929fd4461 Mon Sep 17 00:00:00 2001 From: Akshaj Verma Date: Sat, 18 Dec 2021 18:07:20 +0530 Subject: [PATCH 6/8] Run linter. --- pytorchvideo/data/__init__.py | 14 ++++++++------ pytorchvideo/data/clip_sampling.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pytorchvideo/data/__init__.py b/pytorchvideo/data/__init__.py index ab21e663..f08faa05 100644 --- a/pytorchvideo/data/__init__.py +++ b/pytorchvideo/data/__init__.py @@ -2,12 +2,6 @@ from .ava import Ava # noqa from .charades import Charades # noqa -from .clip_sampling import ( # noqa; noqa - ClipSampler, - RandomClipSampler, - UniformClipSampler, - make_clip_sampler, -) from .domsev import DomsevFrameDataset, DomsevVideoDataset # noqa from .epic_kitchen_forecasting import EpicKitchenForecasting # noqa from .epic_kitchen_recognition import EpicKitchenRecognition # noqa @@ -16,3 +10,11 @@ from .labeled_video_dataset import LabeledVideoDataset, labeled_video_dataset # noqa from .ssv2 import SSv2 from .ucf101 import Ucf101 # noqa + + +from .clip_sampling import ( # noqa; noqa + ClipSampler, + RandomClipSampler, + UniformClipSampler, + make_clip_sampler, +) diff --git a/pytorchvideo/data/clip_sampling.py b/pytorchvideo/data/clip_sampling.py index a11b2b97..fc4ba235 100644 --- a/pytorchvideo/data/clip_sampling.py +++ b/pytorchvideo/data/clip_sampling.py @@ -3,7 +3,7 @@ import random from abc import ABC, abstractmethod from fractions import Fraction -from typing import Any, Dict, NamedTuple, Optional, Tuple, Union, List +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union class ClipInfo(NamedTuple): From e126c8c05dd41a3886277a5c044b7913fc2167a6 Mon Sep 17 00:00:00 2001 From: Akshaj Verma Date: Mon, 20 Dec 2021 09:25:31 +0530 Subject: [PATCH 7/8] Replace dot product with cosine similarity. 
--- pytorchvideo/neural_engine/retrieval_hook.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pytorchvideo/neural_engine/retrieval_hook.py b/pytorchvideo/neural_engine/retrieval_hook.py index 690f6bed..1f3c4548 100644 --- a/pytorchvideo/neural_engine/retrieval_hook.py +++ b/pytorchvideo/neural_engine/retrieval_hook.py @@ -10,19 +10,16 @@ def create_keypoint_features_db(frame_tracker): def calculate_distance_scores(action_query, keypoint_feature_db): - scores = ( - torch.nn.functional.normalize(action_query) - @ torch.nn.functional.normalize(keypoint_feature_db).T - ) + scores = torch.nn.functional.cosine_similarity(action_query, keypoint_feature_db) return scores def get_closest_keypoint_feature_match(scores, method, n): if method == "topk": - return torch.topk(scores, n).indices.squeeze().tolist() + return torch.topk(scores, n).indices.tolist() elif method == "softmax": - score_probs = torch.nn.functional.softmax(scores, dim=1) - return (score_probs > n).squeeze().nonzero().tolist()[0] + score_probs = torch.nn.functional.softmax(scores, dim=0) + return (score_probs > n).nonzero().squeeze().tolist() def bbox_to_frame_executor(frame_tracker, best_bbox_matches): From 884838710b18ecb511dcff9b587de8de99b0b577 Mon Sep 17 00:00:00 2001 From: Akshaj Verma Date: Mon, 20 Dec 2021 10:46:17 +0530 Subject: [PATCH 8/8] Populate frame tracker after decoding video. --- pytorchvideo/neural_engine/hook.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorchvideo/neural_engine/hook.py b/pytorchvideo/neural_engine/hook.py index acf11544..dc1a3ede 100644 --- a/pytorchvideo/neural_engine/hook.py +++ b/pytorchvideo/neural_engine/hook.py @@ -178,6 +178,9 @@ def _run( status, decode_audio=self.decode_audio, decoder=self.decoder ) + # populate the frame tracker while decoding videos + self._populate_frame_tracker(frames=frames) + return frames
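End-to-end usage sketch. The workflow comment from the initial retrieval_hook.py was dropped during the refactor to hooks; the snippet below reproduces that flow against the refactored pieces using synthetic tensors only, so it runs without the keypoint-detection model or the DecodeHook. The 17x2 keypoint shape, the two-people-per-frame layout, the random action query, and the mean-based threshold are illustrative assumptions; PeopleKeypointRetrievalHook itself hard-codes the "softmax" strategy with n=0.9.

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)

    # Stand-in for the frame_tracker that DecodeHook._populate_frame_tracker builds:
    # one entry per detected person, with bbox-centered (x, y) keypoints.
    frame_tracker = [
        {"frame_id": f, "bbox_id": b, "person_id": None, "keypoint_coord": torch.randn(17, 2)}
        for f in range(5)
        for b in range(2)
    ]

    # create_keypoint_features_db: flatten each keypoint set into one row -> (N, 34).
    keypoint_feature_db = torch.stack(
        [entry["keypoint_coord"].flatten() for entry in frame_tracker]
    )

    # An action query embedded in the same feature space (random here).
    action_query = torch.randn(1, 17 * 2)

    # calculate_distance_scores: cosine similarity of the query against every bbox feature.
    scores = F.cosine_similarity(action_query, keypoint_feature_db)

    # get_closest_keypoint_feature_match, "softmax" strategy: keep bboxes whose
    # normalized score clears a threshold (the mean is used here purely for the demo).
    probs = F.softmax(scores, dim=0)
    best_bbox_ids = (probs > probs.mean()).nonzero().squeeze(-1).tolist()

    # bbox_to_frame_executor: map the matching bbox ids back to frame ids.
    print([frame_tracker[i]["frame_id"] for i in best_bbox_ids])

The "topk" strategy is the alternative when a fixed number of matches is wanted instead of a probability cutoff; with it, n is an integer count rather than a threshold.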