Hi, I’m a bit confused about the annotations in the Future Hands Prediction benchmark. Are the bounding boxes exact?
For example, here is the annotation from v1/annotations/fho_hands_train.json:
"clips": [
...
{
"clip_id": 121,
"clip_uid": "4f68183f-610a-44de-b102-e7f300b49dcd",
"video_uid": "26202090-684d-4be8-b3cc-de04da827e91",
"frames": [
{
"action_start_sec": 30.654361933333334,
"action_end_sec": 38.654361933333334,
"action_start_frame": 919,
"action_end_frame": 1159,
"action_clip_start_sec": 30.654361933333334,
"action_clip_end_sec": 38.654361933333334,
"action_clip_start_frame": 919,
"action_clip_end_frame": 1159,
"pre_45": {
"frame": 984,
"clip_frame": 984,
"boxes": [
{
"left_hand": [
103.80088888888889,
145.34637037037035
]
}
]
},
...
And here’s the same annotation in v2/annotations/fho_main.json:
```json
{
  "clip_id": "121",
  "clip_uid": "4f68183f-610a-44de-b102-e7f300b49dcd",
  "start_sec": 0.0,
  "end_sec": 300.0,
  "clip_parent_start_sec": 0.0,
  "clip_parent_end_sec": 308.0,
  "narrated_actions": [
    ...
    {
      "warnings": [],
      "uid": "3b867f4f-958b-4735-aecd-5984998658a7",
      "start_sec": 30.654361933333334,
      "end_sec": 38.654361933333334,
      "start_frame": 919,
      "end_frame": 1159,
      "is_valid_action": true,
      "is_partial": false,
      "clip_start_sec": 30.654361933333334,
      "clip_end_sec": 38.654361933333334,
      "clip_start_frame": 919,
      "clip_end_frame": 1159,
      ...
      "critical_frames": {
        "pre_45": 984,
        "pre_30": 999,
        "pre_15": 1014,
        "post_frame": 1063,
        "contact_frame": 1037,
        "pre_frame": 1029,
        "pnr_frame": 1049
      },
      "clip_critical_frames": {
        "pre_45": 984,
        "pre_30": 999,
        "pre_15": 1014,
        "post_frame": 1063,
        "contact_frame": 1037,
        "pre_frame": 1029,
        "pnr_frame": 1049
      },
      "frames": [
        {
          "frame_number": 984,
          "frame_type": "pre_45",
          "boxes": [
            {
              "object_type": "left_hand",
              "structured_noun": null,
              "instance_number": 1,
              "bbox": {
                "x": 384.23,
                "y": 562.93,
                "width": 107.36,
                "height": 100.5
              },
              "freeform_noun": null,
              "_structured_noun": null,
              "_freeform_noun": null,
              "unsure_noun": false
            },
```
The left-hand annotation for the same pre_45 frame (984) differs between the two versions: v1 stores only a pair of values, (103.80088888888889, 145.34637037037035), while v2 stores a full box with x/y/width/height whose top-left corner is at (384.23, 562.93).
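For reference, this is roughly how I pulled those two entries out. It's only a sketch: the local file paths are mine, and since I've only pasted a fragment of fho_main.json I don't assume its full top-level layout and just walk the whole file instead.

```python
import json

# Local paths are assumptions; adjust to wherever the annotations were downloaded.
V1_PATH = "v1/annotations/fho_hands_train.json"
V2_PATH = "v2/annotations/fho_main.json"

CLIP_UID = "4f68183f-610a-44de-b102-e7f300b49dcd"
PRE45_FRAME = 984

# v1 structure as in the excerpt above: clips -> frames -> pre_45 -> boxes
with open(V1_PATH) as f:
    v1 = json.load(f)

clip = next(c for c in v1["clips"] if c["clip_uid"] == CLIP_UID)
for action in clip["frames"]:
    pre45 = action.get("pre_45")
    if isinstance(pre45, dict) and pre45.get("frame") == PRE45_FRAME:
        print("v1 pre_45 boxes:", pre45["boxes"])

# v2: walk the whole JSON tree and print any pre_45 frame entry with
# frame_number 984 that sits somewhere under this clip_uid.
def walk(node, inside_clip=False):
    if isinstance(node, dict):
        inside_clip = inside_clip or node.get("clip_uid") == CLIP_UID
        if (inside_clip and node.get("frame_type") == "pre_45"
                and node.get("frame_number") == PRE45_FRAME):
            print("v2 pre_45 boxes:", node.get("boxes"))
        for value in node.values():
            walk(value, inside_clip)
    elif isinstance(node, list):
        for value in node:
            walk(value, inside_clip)

with open(V2_PATH) as f:
    walk(json.load(f))
```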
Additionally, I’ve noticed that all the bounding box coordinates in the Future Hands Prediction benchmark appear scaled down compared to the annotations in other files such as sta.json; the values are consistently much smaller.
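To put a number on "smaller", I also did a quick back-of-the-envelope comparison of the two entries quoted above. Treating the v1 pair as a single (x, y) point is purely my assumption; v2 is clearly an x/y/width/height box.

```python
# Quick arithmetic on the two values quoted above.
v1_x, v1_y = 103.80088888888889, 145.34637037037035
v2 = {"x": 384.23, "y": 562.93, "width": 107.36, "height": 100.5}

# Box centre in v2 coordinates.
v2_cx = v2["x"] + v2["width"] / 2
v2_cy = v2["y"] + v2["height"] / 2

print("ratio vs. v2 top-left:", v2["x"] / v1_x, v2["y"] / v1_y)  # ~3.70, ~3.87
print("ratio vs. v2 centre:  ", v2_cx / v1_x, v2_cy / v1_y)      # ~4.22, ~4.22
```

I'm not sure whether those ratios mean anything, but they are why the v1 values look scaled down to me rather than simply wrong.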
Could anyone explain why this is happening?