Learn how to create datasets, upload files, connect cloud storage, enable multimodal indexing, and manage dataset operations in Labellerr using the Python SDK.
A dataset in Labellerr is a standalone collection of files (images, videos, audio, documents, or text) that can be created independently and attached to one or multiple projects. This modular approach allows you to:
- Reuse the same dataset across multiple annotation projects
- Manage your data separately from project configurations
- Connect cloud storage (AWS S3, Google Cloud Storage) for seamless data access
# Example: create a dataset by uploading the contents of a local folder.
from labellerr.client import LabellerrClient
from labellerr.core.schemas import DatasetConfig
from labellerr.core.datasets import create_dataset_from_local

# Replace the placeholder strings with your own credentials.
client = LabellerrClient(
    api_key='your_api_key',
    api_secret='your_api_secret',
    client_id='your_client_id',
)

# folder_to_upload points at the local directory whose files are uploaded.
dataset = create_dataset_from_local(
    client=client,
    dataset_config=DatasetConfig(
        dataset_name="My Image Dataset",
        dataset_description="A collection of images for object detection",
        data_type="image",
    ),
    folder_to_upload="path/to/your/image/folder",
)

print(f"Dataset created with ID: {dataset.dataset_id}")
print(f"Total files: {dataset.files_count}")
Limitations:
- Maximum of 2,500 files per folder
- Total folder size should not exceed 2.5 GB
Local uploads are slower as files must be transferred through your machine to cloud storage. For large-scale datasets, use cloud storage connections (AWS S3/GCS) for faster direct access.
Create Dataset with File List
To upload specific files instead of an entire folder, pass their paths via `files_to_upload`:
# Example: create a dataset from an explicit list of local file paths.
from labellerr.client import LabellerrClient
from labellerr.core.schemas import DatasetConfig
from labellerr.core.datasets import create_dataset_from_local

# Replace the placeholder strings with your own credentials.
client = LabellerrClient(
    api_key='your_api_key',
    api_secret='your_api_secret',
    client_id='your_client_id',
)

# files_to_upload takes individual file paths rather than a folder.
dataset = create_dataset_from_local(
    client=client,
    dataset_config=DatasetConfig(
        dataset_name="Curated Image Dataset",
        dataset_description="Specific images selected for annotation",
        data_type="image",
    ),
    files_to_upload=[
        "path/to/image1.jpg",
        "path/to/image2.jpg",
        "path/to/image3.png",
    ],
)

print(f"Dataset created with ID: {dataset.dataset_id}")
Use this feature when new files are added to your cloud storage bucket and you want to make them available in your Labellerr dataset without creating a new dataset.
# Example: end-to-end workflow -- create a dataset from a local folder, wait
# for processing, enable multimodal indexing, then list the uploaded files.
from labellerr.client import LabellerrClient
from labellerr.core.schemas import DatasetConfig
from labellerr.core.datasets import create_dataset_from_local, LabellerrDataset

# Replace the placeholder strings with your own credentials.
client = LabellerrClient(
    api_key='your_api_key',
    api_secret='your_api_secret',
    client_id='your_client_id',
)

# Step 1: Create dataset with local files
dataset = create_dataset_from_local(
    client=client,
    dataset_config=DatasetConfig(
        dataset_name="Production Image Dataset",
        dataset_description="High-quality images for production annotation",
        data_type="image",
    ),
    folder_to_upload="path/to/images",
)

# Step 2: Wait for dataset processing to complete
print(f"Dataset ID: {dataset.dataset_id}")
dataset.status()  # Wait for dataset to be ready
print(f"Files uploaded: {dataset.files_count}")

# Step 3: Enable multimodal indexing
indexing_result = dataset.enable_multimodal_indexing(is_multimodal=True)
print(f"Multimodal indexing enabled: {indexing_result}")

# Step 4: Fetch files for verification
files = dataset.fetch_files()
print(f"Total files in dataset: {len(files)}")

# Now this dataset can be attached to one or more projects
print(f"Dataset {dataset.dataset_id} is ready to be used in projects!")
Always test your connection before creating datasets to catch permission issues early:
Test Connection
# Example: check that an existing storage connection can access a given path
# before creating a dataset from it.
from labellerr.client import LabellerrClient
from labellerr.core.connectors import LabellerrConnection
from labellerr.core.schemas import ConnectionType, DatasetDataType

# Initialize client
# Keyword placeholders (matching the other examples in this guide) keep the
# snippet self-contained instead of relying on undefined variables.
client = LabellerrClient(
    api_key='your_api_key',
    api_secret='your_api_secret',
    client_id='your_client_id',
)

# Get your existing connection
connection = LabellerrConnection(client=client, connection_id="your_connection_id")

# Test the connection on your specific path
test_result = connection.test(
    path="s3://your-bucket/path/to/data/",  # or gs:// for GCS
    connection_type=ConnectionType._IMPORT,
    data_type=DatasetDataType.image,
)
print(f"Connection test: {test_result}")