-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatasets.json
More file actions
649 lines (649 loc) · 28 KB
/
datasets.json
File metadata and controls
649 lines (649 loc) · 28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
{
"items": [
{
"id": 1768741495407,
"title": "MegaDepth",
"year": 2018,
"description": "A large-scale dataset of 196 outdoor scenes reconstructed from 1 million internet photos using multi-view stereo (MVS). Provides ordinal depth relations, metric depth maps, and camera poses for tasks like single-view depth prediction and visual localization.",
"keywords": [
"image",
"depth",
"pose",
"outdoor"
],
"link": "https://www.cs.cornell.edu/projects/megadepth/",
"createdAt": "2026-01-18T13:04:55.407Z"
},
{
"id": 1770644875879,
"title": "RoboSpatial",
"year": 2025,
"description": "A large-scale spatial understanding dataset for robotics containing over 3M annotated images with rich spatial information including quantitative spatial relationships, affordances, and multi-granularity 2D/3D spatial Q&A pairs, designed to teach VLMs spatial reasoning for robotic manipulation.",
"keywords": [
"image",
"depth",
"pose",
"questions",
"robotics"
],
"link": "https://github.com/NVlabs/RoboSpatial",
"createdAt": "2026-02-09T13:47:55.879Z"
},
{
"id": 1770848561824,
"title": "AnnyOne",
"year": 2026,
"description": "A large-scale synthetic dataset from Naver Labs featuring photorealistic multi-view renderings of diverse human meshes in indoor environments. Provides ground-truth 3D body meshes, depth maps, and camera parameters for human pose/shape estimation and novel-view synthesis.",
"keywords": [
"human",
"mesh",
"synthetic",
"indoor",
"multi-view"
],
"link": "https://europe.naverlabs.com/research/human-centric-computer-vision/anny-one/",
"createdAt": "2026-02-11T22:22:41.824Z"
},
{
"id": 1770890678661,
"title": "RealEstate10K",
"year": 2018,
"description": "A dataset of 10 million frames derived from about 80,000 YouTube video clips of real estate walkthroughs. Each clip is annotated with camera intrinsics and poses obtained via SLAM, making it widely used for novel view synthesis, depth estimation, and multi-view geometry tasks.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://google.github.io/realestate10k/",
"createdAt": "2026-02-12T10:04:38.661Z"
},
{
"id": 1770891623440,
"title": "DyCheck",
"year": 2022,
"description": "DyCheck is a dataset of dynamic scenes with ground truth depth maps and camera poses. It contains three subsets from different sources: HyperNeRF, iPhone, and Nerfies.",
"keywords": [
"image",
"depth",
"pose",
"dynamic"
],
"link": "https://kair-bair.github.io/dycheck/",
"createdAt": "2026-02-12T10:20:23.440Z"
},
{
"id": 1770891722588,
"title": "Kubric",
"year": 2020,
"description": "Kubric is a dataset of multi-view images with camera poses, depth maps, and dynamic objects.",
"keywords": [
"image",
"depth",
"pose",
"dynamic"
],
"link": "https://github.com/google-research/kubric",
"createdAt": "2026-02-12T10:22:02.588Z"
},
{
"id": 1770891804399,
"title": "DL3DV10K",
"year": 2024,
"description": "10,510 multi-view scenes covering 51.2 million frames at 4k resolution.\n140 videos as Novel view synthesis (NVS) benchmark.\nAll videos are annotated by scene environment (indoor vs. outdoor), levels of reflection, transparency, and lighting.\nReleased samples include colmap calculated camera pose.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://github.com/DL3DV-10K/Dataset",
"createdAt": "2026-02-12T10:23:24.399Z"
},
{
"id": 1770891977281,
"title": "ETH3D",
"year": 2019,
"description": "It contains both high and low resolution images for SLAM and Multi-view Stereo",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://www.eth3d.net/",
"createdAt": "2026-02-12T10:26:17.281Z"
},
{
"id": 1770892241460,
"title": "Dynamic Replica",
"year": 2023,
"description": "It has about 145200 stereo frames (524 videos) with humans and animals in motion.",
"keywords": [
"image",
"depth",
"pose",
"scene flow",
"instance mask",
"foreground-background mask",
"long-range pixel trajectories"
],
"link": "https://github.com/facebookresearch/dynamic_stereo",
"createdAt": "2026-02-12T10:30:41.460Z"
},
{
"id": 1770892309424,
"title": "BlendedMVS",
"year": 2020,
"description": "a large-scale MVS dataset for generalized multi-view stereo networks. The dataset contains 17k MVS training samples covering a variety of 113 scenes, including architectures, sculptures and small objects.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://github.com/YoYo000/BlendedMVS",
"createdAt": "2026-02-12T10:31:49.424Z"
},
{
"id": 1770892401970,
"title": "ARKitScenes",
"year": 2026,
"description": "ARKitScenes is the first RGB-D dataset captured with the widely available Apple LiDAR scanner. Along with the raw data we provide the camera pose and surface reconstruction for each scene.\n ARKitScenes is the largest indoor 3D dataset consisting of 5,047 captures of 1,661 unique scenes.\nWe provide high quality ground truth of (a) registered RGB-D frames and (b) oriented bounding boxes of room defining objects.",
"keywords": [
"image",
"depth",
"pose",
"3D bbox"
],
"link": "https://github.com/apple/ARKitScenes",
"createdAt": "2026-02-12T10:33:21.970Z"
},
{
"id": 1770892541974,
"title": "Cubify Anything",
"year": 2025,
"description": "This work is related to ARKitScenes. We generally share the same underlying captures. Some notable differences in CA-1M:\n\nEach scene has been exhaustively annotated with class-agnostic 3D boxes. We release these in the laser scanner's coordinate frame.\nFor each frame in each capture, we include \"per-frame\" 3D box ground-truth which was produced using the rendering process outlined in the Cubify Anything paper. These annotations are, therefore, independent of any pose.\nSome other nice things:\n\nWe release the GT poses (registered to laser scanner) for every frame in each capture.\nWe release the GT depth (rendered from laser scanner) at 512 x 384 for every frame in each capture.\nEach frame has been already oriented into an upright position.",
"keywords": [
"image",
"depth",
"pose",
"3D bbox"
],
"link": "https://github.com/apple/ml-cubifyanything",
"createdAt": "2026-02-12T10:35:41.974Z"
},
{
"id": 1770892678475,
"title": "WildRGBD",
"year": 2024,
"description": "a large-scale collection of 3D object data captured in real-world settings, containing nearly\n20,000 RGB-D videos of approximately 8,500 objects across 46 categories. The data was gathered using an iPhone moving\n360 degrees around the objects.",
"keywords": [
"image",
"depth",
"pose",
"object mask"
],
"link": "https://github.com/wildrgbd/wildrgbd",
"createdAt": "2026-02-12T10:37:58.475Z"
},
{
"id": 1770892821800,
"title": "Hypersim",
"year": 2021,
"description": "A photorealistic synthetic dataset for holistic indoor scene understanding. It leverages a large repository of synthetic\nscenes created by professional artists, and generate 77,400 images of 461 indoor scenes with detailed per-pixel labels\nand corresponding ground truth geometry.",
"keywords": [
"image",
"depth",
"pose",
"3D bbox",
"mesh",
"semantic"
],
"link": "https://github.com/apple/ml-hypersim",
"createdAt": "2026-02-12T10:40:21.800Z"
},
{
"id": 1770892998165,
"title": "ScanNet",
"year": 2017,
"description": "an RGB-D video dataset containing 2.5 million views in more than 1500 scans, annotated with 3D camera poses,\nsurface reconstructions, and instance-level semantic segmentations.",
"keywords": [
"image",
"depth",
"pose",
"instance"
],
"link": "https://scan-net.org/",
"createdAt": "2026-02-12T10:43:18.165Z"
},
{
"id": 1770893084236,
"title": "ScanNet++",
"year": 2023,
"description": "a large scale dataset with 1000+ 3D indoor scenes containing sub-millimeter resolution laser scans, registered 33-megapixel DSLR images, and commodity RGB-D streams from iPhone. The 3D reconstructions are annotated with long-tail and label-ambiguous semantics to benchmark semantic understanding methods, while the coupled DSLR and iPhone captures enable benchmarking of novel view synthesis methods in high-quality and commodity settings.",
"keywords": [
"image",
"depth",
"pose",
"3D semantic",
"3D instance"
],
"link": "https://scannetpp.mlsg.cit.tum.de/scannetpp/",
"createdAt": "2026-02-12T10:44:44.236Z"
},
{
"id": 1770893174010,
"title": "Mapillary Planet-scale Depth Dataset (MPSD)",
"year": 2026,
"description": "a diverse street-level imagery dataset with metric depth information for outdoor metric depth estimation\ncontaining 750,000 images extracted from over 50,000 individual 3D reconstructions captured by a broad range of camera\ntypes with different focal lengths.",
"keywords": [
"image",
"depth",
"pose",
"outdoor"
],
"link": "https://www.mapillary.com/dataset/depth",
"createdAt": "2026-02-12T10:46:14.010Z"
},
{
"id": 1770893286096,
"title": "Mapillary Metropolis",
"year": 2021,
"description": "27,745 high-resolution 360° images with human-curated annotations\n3D point clouds from: aerial and street-level LIDAR, Structure-from-Motion and Multiview-Stereo reconstructions, geo-anchored based on high-precision, survey-grade ground control points\nFull aerial image cover with 7.5 cm/px resolution\nManually labeled 2D / 3D object annotations for up to 39 semantic categories\nHuman annotated aerial-to-ground correspondences\nRegistered CAD models, machine-generated panoptic segmentation masks and more...",
"keywords": [
"image",
"depth",
"pose",
"2D semantic",
"3D semantic"
],
"link": "https://www.mapillary.com/dataset/metropolis",
"createdAt": "2026-02-12T10:48:06.096Z"
},
{
"id": 1770893447244,
"title": "MVS-Synth",
"year": 2018,
"description": "a photo-realistic synthetic dataset prepared for learning-based Multi-View Stereo algorithms. It consists of 120 sequences, each with 100 frames of urban scenes captured in the video game Grand Theft Auto V.",
"keywords": [
"image",
"depth",
"pose",
"synthetic"
],
"link": "https://phuang17.github.io/DeepMVS/mvs-synth.html",
"createdAt": "2026-02-12T10:50:47.244Z"
},
{
"id": 1770893580610,
"title": "DL3DV",
"year": 2025,
"description": "CUT3R authors used COLMAP MVS to undistort images and obtain dense depth. They only pre-processed the 1K-7K\nsubset, so only ~6200 scenes of the total ~10K scenes are available.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://github.com/CUT3R/CUT3R/blob/main/docs/preprocess.md#dl3dv",
"createdAt": "2026-02-12T10:53:00.610Z"
},
{
"id": 1770893711250,
"title": "Mid-Air",
"year": 2019,
"description": "Mid-Air, The Montefiore Institute Dataset of Aerial Images and Records, is a multi-purpose synthetic dataset for low altitude drone flights. It provides a large amount of synchronized data corresponding to flight records for multi-modal vision sensors and navigation sensors mounted on board of a flying quadcopter. Our multi-modal vision sensors capture RGB pictures, relative surface normal orientation, depth, object semantics and stereo disparity.",
"keywords": [
"image",
"depth",
"pose",
"imu",
"drone"
],
"link": "https://midair.ulg.ac.be/",
"createdAt": "2026-02-12T10:55:11.250Z"
},
{
"id": 1770893867020,
"title": "BEDLAM",
"year": 2023,
"description": "The following synthetic image assets are available for download:\n10450 image sequences, 30fps, 1280x720\nimages (PNG)\ndepth maps (EXR/32-bit)\nsegmentation masks (PNG)\nseparate binary masks for subject body/clothing/hair and environment\nmovies (MP4)\nground truth for all sequences (CSV)\nThe following body and clothing related assets are available for download:\nbody textures and clothing overlay textures\nclothing assets\nSMPL-X animation files",
"keywords": [
"image",
"depth",
"pose",
"instance",
"point track",
"human"
],
"link": "https://bedlam.is.tue.mpg.de/",
"createdAt": "2026-02-12T10:57:47.020Z"
},
{
"id": 1770894027399,
"title": "Bedlam2",
"year": 2025,
"description": "Synthetic image data:\n27480 image sequences, 30fps, 1280x720\n8 million images (PNG, 11TB)\nmovies (MP4/H.264, 160GB)\ncamera and body ground truth for all sequences (CSV+JSON)\ndepth maps (EXR/16-bit, available for 44% of images, 15TB)\nRender assets:\nbody textures\nclothing assets\nanimation files for SMPL-X model (locked head, no head bun)\nstrand-based hair grooms\nshoes (represented as displacement maps)",
"keywords": [
"image",
"depth",
"pose",
"human",
"optical flow",
"3D/4D tracking",
"dynamic"
],
"link": "https://bedlam2.is.tuebingen.mpg.de/",
"createdAt": "2026-02-12T11:00:27.399Z"
},
{
"id": 1770894154255,
"title": "TartanAir V2",
"year": 2026,
"description": "a large-scale, photorealistic simulation environment designed to train and benchmark visual\nSLAM. It provides challenging camera trajectories through diverse worlds, weather, and lighting conditions, with\nprecise ground truth data for depth, optical flow, and segmentation.",
"keywords": [
"image",
"depth",
"pose",
"optical flow",
"segmentation"
],
"link": "https://tartanair.org/",
"createdAt": "2026-02-12T11:02:34.255Z"
},
{
"id": 1770894430583,
"title": "ParallelDomain-4D",
"year": 2026,
"description": "These ~1500 scenes were provided by the ParallelDomain engine, and contain photorealistic driving scenarios with diverse environments, traffic patterns, vehicles, pedestrians, and weather conditions. Each scene contains synchronized videos from 19 camera viewpoints (3 ego, 16 surround) and 50 frames at a resolution of 640 x 480 and a frame rate of 10 FPS. The cameras follow the car at the center of each scene precisely. The basic modalities are: RGB, depth, semantic segmentation, instance segmentation, and 2D bounding boxes. The additional modalities are: LiDAR point clouds, optical flow, scene flow, and surface normals.",
"keywords": [
"image",
"depth",
"pose",
"semantic",
"instance",
"2D bbox"
],
"link": "https://gcd.cs.columbia.edu/#datasets",
"createdAt": "2026-02-12T11:07:10.583Z"
},
{
"id": 1770894525040,
"title": "Kubric-4D",
"year": 2026,
"description": "These 3000 scenes were generated with the Kubric simulator, and contain multi-object interactions with rich visual appearance and complicated dynamics. Each scene contains synchronized videos from 16 fixed camera viewpoints (4 high, 12 low) and 60 frames at a resolution of 576 x 384 and a frame rate of 24 FPS. The available modalities include: RGB, depth, optical flow, object coordinates, surface normals, and instance segmentation",
"keywords": [
"image",
"depth",
"pose",
"optical flow",
"normal",
"instance"
],
"link": "https://gcd.cs.columbia.edu/#datasets",
"createdAt": "2026-02-12T11:08:45.040Z"
},
{
"id": 1770894606385,
"title": "7Scenes",
"year": 2013,
"description": "Microsoft 7-Scenes is a RGB-D dataset featuring seven indoor scenes captured with a Kinect camera. The dataset includes RGB images (640x480), depth maps, and camera-to-world poses. Depth values are provided in metric scale (millimetres converted to metres).",
"keywords": [
"image",
"depth",
"pose",
"indoor"
],
"link": "https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/",
"createdAt": "2026-02-12T11:10:06.385Z"
},
{
"id": 1770894697597,
"title": "CO3Dv2",
"year": 2026,
"description": "The CO3D dataset contains a total of 1.5 million frames from nearly 19,000 videos capturing objects from 50 MS-COCO categories.\nIt provides real multi-view images of object categories annotated with camera poses and ground-truth 3D point clouds.",
"keywords": [
"image",
"depth",
"pose",
"object mask"
],
"link": "https://github.com/facebookresearch/co3d?tab=readme-ov-file#new-features-in-co3dv2",
"createdAt": "2026-02-12T11:11:37.597Z"
},
{
"id": 1770894775396,
"title": "PointOdyssey",
"year": 2026,
"description": "a synthetic dataset designed for point tracking algorithms. It features 159 videos, each averaging 2,000 frames and ~20k annotated 3d point tracks per sequence, with deformable characters animated using real-world motion capture data. The dataset includes diverse 3D scenes with randomized object and character appearances.\nData includes RGB images, cameras (intrinsics and extrinsics), depth information as well as 2D and 3D point tracks with visibility.\nImage resolution: 540 x 960",
"keywords": [
"image",
"depth",
"pose",
"2D track",
"3D track"
],
"link": "https://pointodyssey.com/",
"createdAt": "2026-02-12T11:12:55.396Z"
},
{
"id": 1770904118696,
"title": "Waymo Open Dataset",
"year": 2020,
"description": "The dataset, containing an unlabeled mixture of data collected in both manually-driven and autonomously-driven modes, is composed of 103,354 segments each containing 20 seconds of object tracks at 10Hz and map data for the area covered by the segment. These segments are further broken into 9 second windows (1 second of history and 8 seconds of future data) with varying overlap.",
"keywords": [
"image",
"lidar",
"pose",
"3D bbox"
],
"link": "https://waymo.com/open/data/motion/",
"createdAt": "2026-02-12T13:48:38.696Z"
},
{
"id": 1770904342237,
"title": "Habitat-Matterport 3D (HM3D)",
"year": 2021,
"description": "The Habitat-Matterport 3D Research Dataset (HM3D) is the largest-ever dataset of 3D indoor spaces. It consists of 1,000 high-resolution 3D scans (or digital twins) of building-scale residential, commercial, and civic spaces generated from real-world environments.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://aihabitat.org/datasets/hm3d/",
"createdAt": "2026-02-12T13:52:22.237Z"
},
{
"id": 1770904530723,
"title": "Aria Synthetic Environments (ASE)",
"year": 2024,
"description": "100,000 unique multi-room interior scenes\nSimulated with realistic device trajectories\nAcross ~2-minute trajectories\nPopulated with ~8000 3D objects\nWith semi-dense map representations\nNumber of scenes: 100K\nNumber of images: 58M+\nTrajectories\nTotal time: 67 days\nTotal distance: London -> San Francisco(7800 km)\nRooms: Up to 5 complex Manhattan rooms\nAll surfaces in the world are aligned with three dominant directions, typically corresponding to the X, Y, and Z axes\nDataset size: ~23TB\nUndistort tool https://github.com/google-deepmind/tapnet/blob/main/tapnet/tapvid3d/annotation_generation/adt_utils.py",
"keywords": [
"image",
"depth",
"pose",
"2D instance",
"3D floor plan",
"synthetic"
],
"link": "https://facebookresearch.github.io/projectaria_tools/docs/open_datasets/aria_synthetic_environments_dataset",
"createdAt": "2026-02-12T13:55:30.723Z"
},
{
"id": 1770905220309,
"title": "Taskonomy",
"year": 2018,
"description": "Complete pixel-level geometric information via aligned meshes.\nSemantic information via knowledge distillation from ImageNet, MS COCO, and MIT Places.\nGlobally consistent camera poses. Complete camera intrinsics.",
"keywords": [
"image",
"depth",
"pose",
"2D semantic"
],
"link": "http://taskonomy.stanford.edu/",
"createdAt": "2026-02-12T14:07:00.309Z"
},
{
"id": 1770905458182,
"title": "MegaSynth",
"year": 2024,
"description": "3D dataset comprising 700K scenes (which takes only 3 days to generate) - 70 times larger than the prior real dataset DL3DV - dramatically scaling the training data.",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://hwjiang1510.github.io/MegaSynth/",
"createdAt": "2026-02-12T14:10:58.182Z"
},
{
"id": 1770905585509,
"title": "VKitti2",
"year": 2020,
"description": "",
"keywords": [
"image",
"depth",
"pose",
"2D instance",
"2D semantic",
"optical flow",
"synthetic"
],
"link": "https://europe.naverlabs.com/proxy-virtual-worlds-vkitti-2/",
"createdAt": "2026-02-12T14:13:05.509Z"
},
{
"id": 1770906015723,
"title": "Objaverse-XL",
"year": 2023,
"description": "Objaverse-XL is an open dataset of over 10 million 3D objects!",
"keywords": [
"image",
"depth",
"pose"
],
"link": "https://objaverse.allenai.org/",
"createdAt": "2026-02-12T14:20:15.723Z"
},
{
"id": 1770906169088,
"title": "Omniobject3D",
"year": 2023,
"description": "OmniObject3D has several appealing properties:\n1) Large Vocabulary: It comprises 6,000 scanned objects in 190 daily categories, sharing common classes with popular 2D datasets (e.g., ImageNet and LVIS), benefiting the pursuit of generalizable 3D representations.\n2) Rich Annotations: Each 3D object is captured with both 2D and 3D sensors, providing textured meshes, point clouds, multi-view rendered images, and multiple real-captured videos.\n3) Realistic Scans: The professional scanners support high-quality object scans with precise shapes and realistic appearances.",
"keywords": [
"image",
"depth",
"pose",
"mesh",
"object"
],
"link": "https://omniobject3d.github.io/",
"createdAt": "2026-02-12T14:22:49.088Z"
},
{
"id": 1770906314767,
"title": "OmniWorld",
"year": 2026,
"description": "📊 Massive Scale: 4000+ hours, 600K+ sequences, 300M+ frames\n🤖 Diverse Domains: sourced from simulator, robot, human & the Internet\n🎨 Rich Multi-Modality: depth maps, camera poses, text captions, optical flow & foreground mask",
"keywords": [
"image",
"depth",
"pose",
"object mask",
"foreground mask",
"flow",
"text"
],
"link": "https://github.com/yangzhou24/OmniWorld",
"createdAt": "2026-02-12T14:25:14.767Z"
},
{
"id": 1770906704959,
"title": "Trellis",
"year": 2025,
"description": "TRELLIS-500K is a large-scale object-centric dataset containing 500K 3D assets curated from Objaverse(XL), ABO, 3D-FUTURE, HSSD, and Toys4k, filtered based on aesthetic scores.",
"keywords": [
"image",
"depth",
"pose",
"object",
"mesh"
],
"link": "https://github.com/microsoft/TRELLIS",
"createdAt": "2026-02-12T14:31:44.959Z"
},
{
"id": 1770906892836,
"title": "AriaDigitalTwin",
"year": 2023,
"description": "Dataset Content\n200 sequences (~400 mins)\n398 objects (324 stationary, 74 dynamic)\n2 real indoor scenes\nSingle + multi-user activities\nSensor Data per device\n2 x outward-facing monochrome camera streams\n1 x outward-facing RGB camera stream\n2 x IMU streams\n2 x Internal-facing eye tracking cameras\nComplete sensor calibrations\nAnnotations\n6DoF device trajectory\n3D object pose\n3D human skeleton\n3D eye gaze\n2D Photo-realistic synthetic rendering\n2D bounding box\n2D instance segmentation\n2D depth map",
"keywords": [
"image",
"depth",
"pose",
"2D bbox",
"2D instance",
"human",
"multicamera",
"imu"
],
"link": "https://www.projectaria.com/datasets/adt/",
"createdAt": "2026-02-12T14:34:52.836Z"
},
{
"id": 1770907109314,
"title": "MegaScenes",
"year": 2024,
"description": "The MegaScenes Dataset is an extensive collection of around 430K scenes and 9M images and epipolar geometries, featuring over 100K structure-from-motion reconstructions from 2M of these images. The images of these scenes are captured under varying conditions, including different times of day, various weather and illumination, and from different devices with distinct camera intrinsics.",
"keywords": [
"image",
"sparse pointclouds",
"pose"
],
"link": "https://megascenes.github.io/",
"createdAt": "2026-02-12T14:38:29.314Z"
},
{
"id": 1772806354733,
"title": "Sekai-Real-HQ",
"year": 2025,
"description": "1. High-quality and diverse video. All videos are recorded in 720p, featuring diverse weather, various times, and dynamic scenes.\n\n2. Worldwide location. Videos span 100 countries and regions, showcasing 750+ cities with diverse cultures, activities, and landscapes.\n\n3. Walking and drone view. Beyond walking videos, Sekai includes drone view (FPV and UAV) videos for unrestricted world exploration.\n\n4. Long duration. All walking videos are at least 60 seconds long, ensuring real-world, long-term world exploration.\n5. Rich annotations. All videos are annotated with location, scene, weather, crowd density, captions, and camera trajectories. YouTube videos' annotations are of high quality, while annotations from the game are considered ground truth.",
"keywords": [
"image",
"depth",
"pose",
"weather",
"scene",
"caption"
],
"link": "https://lixsp11.github.io/sekai-project/",
"createdAt": "2026-03-06T14:12:34.733Z"
},
{
"id": 1772806523048,
"title": "SpatialVID: A Large-Scale Video Dataset with Spatial Annotations",
"year": 2026,
"description": "We introduce SpatialVID, a large-scale video dataset with explicit spatial annotations including camera poses, depth maps, structured captions and serialized motion instructions. The dataset consists of 7,089 hours of real-world dynamic scenes.",
"keywords": [
"image",
"depth",
"pose",
"caption"
],
"link": "https://nju-3dv.github.io/projects/SpatialVID/",
"createdAt": "2026-03-06T14:15:23.048Z"
},
{
"id": 1773137798401,
"title": "InsScene-15K",
"year": 2026,
"link": "https://lifuguan.github.io/IGGT_official/",
"description": "15k scenes, 200M images, constructed from Synthetic Data (e.g., Aria, Infinigen), Real-World Video Capture (e.g., RE10K), and Real-World RGBD Capture (e.g., ScanNet++)",
"keywords": [
"image",
"depth",
"pose",
"3D instance"
],
"createdAt": "2026-03-10T10:16:38.401Z"
}
]
}