-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatasets.json
More file actions
649 lines (649 loc) · 28 KB
/
datasets.json
File metadata and controls
649 lines (649 loc) · 28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
{
"items": [
{
"id": 1768741495407,
"title": "MegaDepth",
"year": 2018,
"description": "A large-scale dataset of 196 outdoor scenes reconstructed from 1 million internet photos using multi-view stereo (MVS). Provides ordinal depth relations, metric depth maps, and camera poses for tasks like single-view depth prediction and visual localization.",
"keywords": [
"image",
"depth",
"pose",
"outdoor"
],
"link": "https://www.cs.cornell.edu/projects/megadepth/",
"createdAt": "2026-01-18T13:04:55.407Z"
},
{
"id": 1770644875879,
"title": "RoboSpatial",
"year": 2025,
"description": "A large-scale spatial understanding dataset for robotics containing over 3M annotated images with rich spatial information including quantitative spatial relationships, affordances, and multi-granularity 2D/3D spatial Q&A pairs, designed to teach VLMs spatial reasoning for robotic manipulation.",
"keywords": [
"image",
"depth",
"pose",
"questions",
"robotics"
],
"link": "https://github.com/NVlabs/RoboSpatial",
"createdAt": "2026-02-09T13:47:55.879Z"
},
{
"id": 1770848561824,
"title": "AnnyOne",
"year": 2026,
"description": "A large-scale synthetic dataset from Naver Labs featuring photorealistic multi-view renderings of diverse human meshes in indoor environments. Provides ground-truth 3D body meshes, depth maps, and camera parameters for human pose/shape estimation and novel-view synthesis.",
"keywords": [
"human",
"mesh",
"synthetic",
"indoor",
"multi-view"
],
"link": "https://europe.naverlabs.com/research/human-centric-computer-vision/anny-one/",
"createdAt": "2026-02-11T22:22:41.824Z"
},
{
"id": 1770890678661,
"title": "RealEstate10K",
"year": 2018,
"description": "A dataset of 10 million frames derived from about 80,000 YouTube video clips of real estate walkthroughs. Each clip is annotated with camera intrinsics and poses obtained via SLAM, making it widely used for novel view synthesis, depth estimation, and multi-view geometry tasks.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://google.github.io/realestate10k/",
"createdAt": "2026-02-12T10:04:38.661Z"
},
{
"id": 1770891623440,
"title": "DyCheck",
"year": 2022,
"description": "DyCheck is a dataset of dynamic scenes with ground truth depth maps and camera poses. It contains three subsets from different sources: HyperNeRF, iPhone, and Nerfies.",
"keywords": [
"image",
"depth",
"pose",
"dynamic"
],
"link": "https://kair-bair.github.io/dycheck/",
"createdAt": "2026-02-12T10:20:23.440Z"
},
{
"id": 1770891722588,
"title": "Kubric",
"year": 2020,
"description": "Kubric is a dataset of multi-view images with camera poses, depth maps, and dynamic objects.",
"keywords": [
"image",
"depth",
"pose",
"dynamic"
],
"link": "https://github.com/google-research/kubric",
"createdAt": "2026-02-12T10:22:02.588Z"
},
{
"id": 1770891804399,
"title": "DL3DV10K",
"year": 2024,
"description": "10,510 multi-view scenes covering 51.2 million frames at 4k resolution.\n140 videos as Novel view synthesis (NVS) benchmark.\nAll videos are annotated by scene environment (indoor vs. outdoor), levels of reflection, transparency, and lighting.\nReleased samples include colmap calculated camera pose.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://github.com/DL3DV-10K/Dataset",
"createdAt": "2026-02-12T10:23:24.399Z"
},
{
"id": 1770891977281,
"title": "ETH3D",
"year": 2019,
"description": "It contains both high and low resolution images for SLAM and Multi-view Stereo",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://www.eth3d.net/",
"createdAt": "2026-02-12T10:26:17.281Z"
},
{
"id": 1770892241460,
"title": "Dynamic Replica",
"year": 2023,
"description": "It has about 145200 stereo frames (524 videos) with humans and animals in motion.",
"keywords": [
"image",
"depth",
"pose",
"scene flow",
"instance mask",
"foreground-background mask",
"long-range pixel trajectories"
],
"link": "https://github.com/facebookresearch/dynamic_stereo",
"createdAt": "2026-02-12T10:30:41.460Z"
},
{
"id": 1770892309424,
"title": "BlendedMVS",
"year": 2020,
"description": "a large-scale MVS dataset for generalized multi-view stereo networks. The dataset contains 17k MVS training samples covering a variety of 113 scenes, including architectures, sculptures and small objects.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://github.com/YoYo000/BlendedMVS",
"createdAt": "2026-02-12T10:31:49.424Z"
},
{
"id": 1770892401970,
"title": "ARKitScenes",
"year": 2026,
"description": "ARKitScenes is the first RGB-D dataset captured with the widely available Apple LiDAR scanner. Along with the raw data we provide the camera pose and surface reconstruction for each scene.\n ARKitScenes is the largest indoor 3D dataset consisting of 5,047 captures of 1,661 unique scenes.\nWe provide high quality ground truth of (a) registered RGB-D frames and (b) oriented bounding boxes of room defining objects.",
"keywords": [
"image",
"depth",
"pose",
"3D bbox"
],
"link": "https://github.com/apple/ARKitScenes",
"createdAt": "2026-02-12T10:33:21.970Z"
},
{
"id": 1770892541974,
"title": "Cubify Anything",
"year": 2025,
"description": "This work is related to ARKitScenes. We generally share the same underlying captures. Some notable differences in CA-1M:\n\nEach scene has been exhaustively annotated with class-agnostic 3D boxes. We release these in the laser scanner's coordinate frame.\nFor each frame in each capture, we include \"per-frame\" 3D box ground-truth which was produced using the rendering process outlined in the Cubify Anything paper. These annotations are, therefore, independent of any pose.\nSome other nice things:\n\nWe release the GT poses (registered to laser scanner) for every frame in each capture.\nWe release the GT depth (rendered from laser scanner) at 512 x 384 for every frame in each capture.\nEach frame has been already oriented into an upright position.",
"keywords": [
"image",
"depth",
"pose",
"3D bbox"
],
"link": "https://github.com/apple/ml-cubifyanything",
"createdAt": "2026-02-12T10:35:41.974Z"
},
{
"id": 1770892678475,
"title": "WildRGBD",
"year": 2024,
"description": "a large-scale collection of 3D object data captured in real-world settings, containing nearly\n20,000 RGB-D videos of approximately 8,500 objects across 46 categories. The data was gathered using an iPhone moving\n360 degrees around the objects.",
"keywords": [
"image",
"depth",
"pose",
"object mask"
],
"link": "https://github.com/wildrgbd/wildrgbd",
"createdAt": "2026-02-12T10:37:58.475Z"
},
{
"id": 1770892821800,
"title": "Hypersim",
"year": 2021,
"description": "A photorealistic synthetic dataset for holistic indoor scene understanding. It leverages a large repository of synthetic\nscenes created by professional artists, and generate 77,400 images of 461 indoor scenes with detailed per-pixel labels\nand corresponding ground truth geometry.",
"keywords": [
"image",
"depth",
"pose",
"3D bbox",
"mesh",
"semantic"
],
"link": "https://github.com/apple/ml-hypersim",
"createdAt": "2026-02-12T10:40:21.800Z"
},
{
"id": 1770892998165,
"title": "ScanNet",
"year": 2017,
"description": "an RGB-D video dataset containing 2.5 million views in more than 1500 scans, annotated with 3D camera poses,\nsurface reconstructions, and instance-level semantic segmentations.",
"keywords": [
"image",
"depth",
"pose",
"instance"
],
"link": "https://scan-net.org/",
"createdAt": "2026-02-12T10:43:18.165Z"
},
{
"id": 1770893084236,
"title": "ScanNet++",
"year": 2023,
"description": "a large scale dataset with 1000+ 3D indoor scenes containing sub-millimeter resolution laser scans, registered 33-megapixel DSLR images, and commodity RGB-D streams from iPhone. The 3D reconstructions are annotated with long-tail and label-ambiguous semantics to benchmark semantic understanding methods, while the coupled DSLR and iPhone captures enable benchmarking of novel view synthesis methods in high-quality and commodity settings.",
"keywords": [
"image",
"depth",
"pose",
"3D semantic",
"3D instance"
],
"link": "https://scannetpp.mlsg.cit.tum.de/scannetpp/",
"createdAt": "2026-02-12T10:44:44.236Z"
},
{
"id": 1770893174010,
"title": "Mapillary Planet-scale Depth Dataset (MPSD)",
"year": 2026,
"description": "a diverse street-level imagery dataset with metric depth information for outdoor metric depth estimation\ncontaining 750,000 images extracted from over 50,000 individual 3D reconstructions captured by a broad range of camera\ntypes with different focal lengths.",
"keywords": [
"image",
"depth",
"pose",
"outdoor"
],
"link": "https://www.mapillary.com/dataset/depth",
"createdAt": "2026-02-12T10:46:14.010Z"
},
{
"id": 1770893286096,
"title": "Mapillary Metropolis",
"year": 2021,
"description": "27,745 high-resolution 360° images with human-curated annotations\n3D point clouds from: aerial and street-level LIDAR, Structure-from-Motion and Multiview-Stereo reconstructions, geo-anchored based on high-precision, survey-grade ground control points\nFull aerial image cover with 7.5 cm/px resolution\nManually labeled 2D / 3D object annotations for up to 39 semantic categories\nHuman annotated aerial-to-ground correspondences\nRegistered CAD models, machine-generated panoptic segmentation masks and more...",
"keywords": [
"image",
"depth",
"pose",
"2D semantic",
"3D semantic"
],
"link": "https://www.mapillary.com/dataset/metropolis",
"createdAt": "2026-02-12T10:48:06.096Z"
},
{
"id": 1770893447244,
"title": "MVS-Synth",
"year": 2018,
"description": "a photo-realistic synthetic dataset prepared for learning-based Multi-View Stereo algorithms. It consists of 120 sequences, each with 100 frames of urban scenes captured in the video game Grand Theft Auto V.",
"keywords": [
"image",
"depth",
"pose",
"synthetic"
],
"link": "https://phuang17.github.io/DeepMVS/mvs-synth.html",
"createdAt": "2026-02-12T10:50:47.244Z"
},
{
"id": 1770893580610,
"title": "DL3DV",
"year": 2025,
"description": "CUT3R authors used COLMAP MVS to undistort images and obtain dense depth. They only pre-processed the 1K-7K\nsubset, so only ~6200 scenes of the total ~10K scenes are available.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://github.com/CUT3R/CUT3R/blob/main/docs/preprocess.md#dl3dv",
"createdAt": "2026-02-12T10:53:00.610Z"
},
{
"id": 1770893711250,
"title": "Mid-Air",
"year": 2019,
"description": "Mid-Air, The Montefiore Institute Dataset of Aerial Images and Records, is a multi-purpose synthetic dataset for low altitude drone flights. It provides a large amount of synchronized data corresponding to flight records for multi-modal vision sensors and navigation sensors mounted on board of a flying quadcopter. Our multi-modal vision sensors capture RGB pictures, relative surface normal orientation, depth, object semantics and stereo disparity.",
"keywords": [
"image",
"depth",
"pose",
"imu",
"drone"
],
"link": "https://midair.ulg.ac.be/",
"createdAt": "2026-02-12T10:55:11.250Z"
},
{
"id": 1770893867020,
"title": "BEDLAM",
"year": 2023,
"description": "The following synthetic image assets are available for download:\n10450 image sequences, 30fps, 1280x720\nimages (PNG)\ndepth maps (EXR/32-bit)\nsegmentation masks (PNG)\nseparate binary masks for subject body/clothing/hair and environment\nmovies (MP4)\nground truth for all sequences (CSV)\nThe following body and clothing related assets are available for download:\nbody textures and clothing overlay textures\nclothing assets\nSMPL-X animation files",
"keywords": [
"image",
"depth",
"pose",
"instance",
"point track",
"human"
],
"link": "https://bedlam.is.tue.mpg.de/",
"createdAt": "2026-02-12T10:57:47.020Z"
},
{
"id": 1770894027399,
"title": "Bedlam2",
"year": 2025,
"description": "Synthetic image data:\n27480 image sequences, 30fps, 1280x720\n8 million images (PNG, 11TB)\nmovies (MP4/H.264, 160GB)\ncamera and body ground truth for all sequences (CSV+JSON)\ndepth maps (EXR/16-bit, available for 44% of images, 15TB)\nRender assets:\nbody textures\nclothing assets\nanimation files for SMPL-X model (locked head, no head bun)\nstrand-based hair grooms\nshoes (represented as displacement maps)",
"keywords": [
"image",
"depth",
"pose",
"human",
"optical flow",
"3D/4D tracking",
"dynamic"
],
"link": "https://bedlam2.is.tuebingen.mpg.de/",
"createdAt": "2026-02-12T11:00:27.399Z"
},
{
"id": 1770894154255,
"title": "TartanAir V2",
"year": 2026,
"description": "a large-scale, photorealistic simulation environment designed to train and benchmark visual\nSLAM. It provides challenging camera trajectories through diverse worlds, weather, and lighting conditions, with\nprecise ground truth data for depth, optical flow, and segmentation.",
"keywords": [
"image",
"depth",
"pose",
"optical flow",
"segmentation"
],
"link": "https://tartanair.org/",
"createdAt": "2026-02-12T11:02:34.255Z"
},
{
"id": 1770894430583,
"title": "ParallelDomain-4D",
"year": 2026,
"description": "These ~1500 scenes were provided by the ParallelDomain engine, and contain photorealistic driving scenarios with diverse environments, traffic patterns, vehicles, pedestrians, and weather conditions. Each scene contains synchronized videos from 19 camera viewpoints (3 ego, 16 surround) and 50 frames at a resolution of 640 x 480 and a frame rate of 10 FPS. The cameras follow the car at the center of each scene precisely. The basic modalities are: RGB, depth, semantic segmentation, instance segmentation, and 2D bounding boxes. The additional modalities are: LiDAR point clouds, optical flow, scene flow, and surface normals.",
"keywords": [
"image",
"depth",
"pose",
"semantic",
"instance",
"2D bbox"
],
"link": "https://gcd.cs.columbia.edu/#datasets",
"createdAt": "2026-02-12T11:07:10.583Z"
},
{
"id": 1770894525040,
"title": "Kubric-4D",
"year": 2026,
"description": "These 3000 scenes were generated with the Kubric simulator, and contain multi-object interactions with rich visual appearance and complicated dynamics. Each scene contains synchronized videos from 16 fixed camera viewpoints (4 high, 12 low) and 60 frames at a resolution of 576 x 384 and a frame rate of 24 FPS. The available modalities include: RGB, depth, optical flow, object coordinates, surface normals, and instance segmentation",
"keywords": [
"image",
"depth",
"pose",
"optical flow",
"normal",
"instance"
],
"link": "https://gcd.cs.columbia.edu/#datasets",
"createdAt": "2026-02-12T11:08:45.040Z"
},
{
"id": 1770894606385,
"title": "7Scenes",
"year": 2013,
"description": "Microsoft 7-Scenes is a RGB-D dataset featuring seven indoor scenes captured with a Kinect camera. The dataset includes RGB images (640x480), depth maps, and camera-to-world poses. Depth values are provided in metric scale (millimetres converted to metres).",
"keywords": [
"image",
"depth",
"pose",
"indoor"
],
"link": "https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/",
"createdAt": "2026-02-12T11:10:06.385Z"
},
{
"id": 1770894697597,
"title": "CO3Dv2",
"year": 2026,
"description": "The CO3D dataset contains a total of 1.5 million frames from nearly 19,000 videos capturing objects from 50 MS-COCO categories.\nIt provides real multi-view images of object categories annotated with camera poses and ground-truth 3D point clouds.",
"keywords": [
"image",
"depth",
"pose",
"object mask"
],
"link": "https://github.com/facebookresearch/co3d?tab=readme-ov-file#new-features-in-co3dv2",
"createdAt": "2026-02-12T11:11:37.597Z"
},
{
"id": 1770894775396,
"title": "PointOdyssey",
"year": 2026,
"description": "a synthetic dataset designed for point tracking algorithms. It features 159 videos, each averaging 2,000 frames and ~20k annotated 3d point tracks per sequence, with deformable characters animated using real-world motion capture data. The dataset includes diverse 3D scenes with randomized object and character appearances.\nData includes RGB images, cameras (intrinsics and extrinsics), depth information as well as 2D and 3D point tracks with visibility.\nImage resolution: 540 x 960",
"keywords": [
"image",
"depth",
"pose",
"2D track",
"3D track"
],
"link": "https://pointodyssey.com/",
"createdAt": "2026-02-12T11:12:55.396Z"
},
{
"id": 1770904118696,
"title": "Waymo Open Dataset",
"year": 2020,
"description": "The dataset, containing an unlabeled mixture of data collected in both manually-driven and autonomously-driven modes, is composed of 103,354 segments each containing 20 seconds of object tracks at 10Hz and map data for the area covered by the segment. These segments are further broken into 9 second windows (1 second of history and 8 seconds of future data) with varying overlap.",
"keywords": [
"image",
"lidar",
"pose",
"3D bbox"
],
"link": "https://waymo.com/open/data/motion/",
"createdAt": "2026-02-12T13:48:38.696Z"
},
{
"id": 1770904342237,
"title": "Habitat-Matterport 3D (HM3D)",
"year": 2021,
"description": "The Habitat-Matterport 3D Research Dataset (HM3D) is the largest-ever dataset of 3D indoor spaces. It consists of 1,000 high-resolution 3D scans (or digital twins) of building-scale residential, commercial, and civic spaces generated from real-world environments.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://aihabitat.org/datasets/hm3d/",
"createdAt": "2026-02-12T13:52:22.237Z"
},
{
"id": 1770904530723,
"title": "Aria Synthetic Environments (ASE)",
"year": 2024,
"description": "100,000 unique multi-room interior scenes\nSimulated with realistic device trajectories\nAcross ~2-minute trajectories\nPopulated with ~8000 3D objects\nWith semi-dense map representations\nNumber of scenes: 100K\nNumber of images: 58M+\nTrajectories\nTotal time: 67 days\nTotal distance: London -> San Francisco(7800 km)\nRooms: Up to 5 complex Manhattan rooms\nAll surfaces in the world are aligned with three dominant directions, typically corresponding to the X, Y, and Z axes\nDataset size: ~23TB\nUndistort tool https://github.com/google-deepmind/tapnet/blob/main/tapnet/tapvid3d/annotation_generation/adt_utils.py",
"keywords": [
"image",
"depth",
"pose",
"2D instance",
"3D floor plan",
"synthetic"
],
"link": "https://facebookresearch.github.io/projectaria_tools/docs/open_datasets/aria_synthetic_environments_dataset",
"createdAt": "2026-02-12T13:55:30.723Z"
},
{
"id": 1770905220309,
"title": "Taskonomy",
"year": 2018,
"description": "Complete pixel-level geometric information via aligned meshes.\nSemantic information via knowledge distillation from ImageNet, MS COCO, and MIT Places.\nGlobally consistent camera poses. Complete camera intrinsics.",
"keywords": [
"image",
"depth",
"pose",
"2D semantic"
],
"link": "http://taskonomy.stanford.edu/",
"createdAt": "2026-02-12T14:07:00.309Z"
},
{
"id": 1770905458182,
"title": "MegaSynth",
"year": 2024,
"description": "3D dataset comprising 700K scenes (which takes only 3 days to generate) - 70 times larger than the prior real dataset DL3DV - dramatically scaling the training data.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://hwjiang1510.github.io/MegaSynth/",
"createdAt": "2026-02-12T14:10:58.182Z"
},
{
"id": 1770905585509,
"title": "VKitti2",
"year": 2020,
"description": "",
"keywords": [
"image",
"depth",
"pose",
"2D instance",
"2D semantic",
"optical flow",
"synthetic"
],
"link": "https://europe.naverlabs.com/proxy-virtual-worlds-vkitti-2/",
"createdAt": "2026-02-12T14:13:05.509Z"
},
{
"id": 1770906015723,
"title": "Objaverse-XL",
"year": 2023,
"description": "Objaverse-XL is an open dataset of over 10 million 3D objects!",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://objaverse.allenai.org/",
"createdAt": "2026-02-12T14:20:15.723Z"
},
{
"id": 1770906169088,
"title": "Omniobject3D",
"year": 2023,
"description": "OmniObject3D has several appealing properties:\n1) Large Vocabulary: It comprises 6,000 scanned objects in 190 daily categories, sharing common classes with popular 2D datasets (e.g., ImageNet and LVIS), benefiting the pursuit of generalizable 3D representations.\n2) Rich Annotations: Each 3D object is captured with both 2D and 3D sensors, providing textured meshes, point clouds, multi-view rendered images, and multiple real-captured videos.\n3) Realistic Scans: The professional scanners support high-quality object scans with precise shapes and realistic appearances.",
"keywords": [
"image",
"depth",
"pose",
"mesh",
"object"
],
"link": "https://omniobject3d.github.io/",
"createdAt": "2026-02-12T14:22:49.088Z"
},
{
"id": 1770906314767,
"title": "OmniWorld",
"year": 2026,
"description": "📊 Massive Scale: 4000+ hours, 600K+ sequences, 300M+ frames\n🤖 Diverse Domains: sourced from simulator, robot, human & the Internet\n🎨 Rich Multi-Modality: depth maps, camera poses, text captions, optical flow & foreground mask",
"keywords": [
"image",
"depth",
"pose",
"object mask",
"foreground mask",
"flow",
"text"
],
"link": "https://github.com/yangzhou24/OmniWorld",
"createdAt": "2026-02-12T14:25:14.767Z"
},
{
"id": 1770906704959,
"title": "Trellis",
"year": 2025,
"description": "TRELLIS-500K is a large-scale object-centric dataset containing 500K 3D assets curated from Objaverse(XL), ABO, 3D-FUTURE, HSSD, and Toys4k, filtered based on aesthetic scores.",
"keywords": [
"image",
"depth",
"pose",
"object",
"mesh"
],
"link": "https://github.com/microsoft/TRELLIS",
"createdAt": "2026-02-12T14:31:44.959Z"
},
{
"id": 1770906892836,
"title": "AriaDigitalTwin",
"year": 2023,
"description": "Dataset Content\n200 sequences (~400 mins)\n398 objects (324 stationary, 74 dynamic)\n2 real indoor scenes\nSingle + multi-user activities\nSensor Data per device\n2 x outward-facing monochrome camera streams\n1 x outward-facing RGB camera stream\n2 x IMU streams\n2 x Internal-facing eye tracking cameras\nComplete sensor calibrations\nAnnotations\n6DoF device trajectory\n3D object pose\n3D human skeleton\n3D eye gaze\n2D Photo-realistic synthetic rendering\n2D bounding box\n2D instance segmentation\n2D depth map",
"keywords": [
"image",
"depth",
"pose",
"2D bbox",
"2D instance",
"human",
"multicamera",
"imu"
],
"link": "https://www.projectaria.com/datasets/adt/",
"createdAt": "2026-02-12T14:34:52.836Z"
},
{
"id": 1770907109314,
"title": "MegaScenes",
"year": 2024,
"description": "The MegaScenes Dataset is an extensive collection of around 430K scenes and 9M images and epipolar geometries, featuring over 100K structure-from-motion reconstructions from 2M of these images. The images of these scenes are captured under varying conditions, including different times of day, various weather and illumination, and from different devices with distinct camera intrinsics.",
"keywords": [
"image",
"sparse pointclouds",
"pose"
],
"link": "https://megascenes.github.io/",
"createdAt": "2026-02-12T14:38:29.314Z"
},
{
"id": 1772806354733,
"title": "Sekai-Real-HQ",
"year": 2025,
"description": "1. High-quality and diverse video. All videos are recorded in 720p, featuring diverse weather, various times, and dynamic scenes.\n\n2. Worldwide location. Videos span 100 countries and regions, showcasing 750+ cities with diverse cultures, activities, and landscapes.\n\n3. Walking and drone view. Beyond walking videos, Sekai includes drone view (FPV and UAV) videos for unrestricted world exploration.\n\n4. Long duration. All walking videos are at least 60 seconds long, ensuring real-world, long-term world exploration.\n5. Rich annotations. All videos are annotated with location, scene, weather, crowd density, captions, and camera trajectories. YouTube videos' annotations are of high quality, while annotations from the game are considered ground truth.",
"keywords": [
"image",
"depth",
"pose",
"weather",
"scene",
"caption"
],
"link": "https://lixsp11.github.io/sekai-project/",
"createdAt": "2026-03-06T14:12:34.733Z"
},
{
"id": 1772806523048,
"title": "SpatialVID: A Large-Scale Video Dataset with Spatial Annotations",
"year": 2026,
"description": "We introduce SpatialVID, a large-scale video dataset with explicit spatial annotations including camera poses, depth maps, structured captions and serialized motion instructions. The dataset consists of 7,089 hours of real-world dynamic scenes.",
"keywords": [
"image",
"depth",
"pose",
"caption"
],
"link": "https://nju-3dv.github.io/projects/SpatialVID/",
"createdAt": "2026-03-06T14:15:23.048Z"
},
{
"id": 1773137798401,
"title": "InsScene-15K",
"year": 2026,
"link": "https://lifuguan.github.io/IGGT_official/",
"description": "15k scenes, 200M images, constructed from Synthetic Data (e.g., Aria, Infinigen), Real-World Video Capture (e.g., RE10K), and Real-World RGBD Capture (e.g., ScanNet++)",
"keywords": [
"image",
"depth",
"pose",
"3D instance"
],
"createdAt": "2026-03-10T10:16:38.401Z"
}
]
}