Data is all you need¶

Miroslav Jiřík¶

Goal¶

  • How to split a dataset
  • Important datasets
  • End-to-end training
  • Metacentrum - Czech Computational Grid

How to split a dataset¶

Training Dataset¶

Training Dataset: The sample of data used to fit the model. The actual dataset that we use to train the model (weights and biases in the case of a Neural Network). The model sees and learns from this data.

Validation Dataset¶

Validation Dataset: The sample of data used to provide an unbiased evaluation of a model fit on the training dataset while tuning model hyperparameters. The evaluation becomes more biased as skill on the validation dataset is incorporated into the model configuration.

The validation set is used to evaluate the model repeatedly during development, for example after each training epoch or when comparing hyperparameter settings.

Test Dataset¶

Test Dataset: The sample of data used to provide an unbiased evaluation of a final model fit on the training dataset.

The test dataset provides the gold standard used to evaluate the model. It is only used once a model is completely trained (using the train and validation sets).

Figure: train / validation / test split

In [ ]:
import numpy as np
from sklearn.model_selection import train_test_split
In [11]:
X, y = np.arange(10).reshape((5, 2)), range(5)
X,y
Out[11]:
(array([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]]),
 range(0, 5))
In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test
Out[10]:
(array([[4, 5],
        [0, 1],
        [6, 7]]),
 array([[2, 3],
        [8, 9]]),
 [2, 0, 3],
 [1, 4])
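
The example above only produces a train/test split. To also obtain a validation set, one common approach (a sketch with illustrative ratios, not the only option) is to call train_test_split twice: first hold out the test set, then split the remainder into training and validation parts.

In [ ]:
# Sketch: two-step split into train / validation / test (roughly 60/20/20).
# First carve out 20% as the test set, then take 25% of the remaining 80%
# (i.e. 20% of the original data) as the validation set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42)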

References¶

  • Train, validation and test sets
  • scikit-learn

Datasets¶

MNIST¶

The MNIST database of handwritten digits has a training set of 60,000 examples and a test set of 10,000 examples. It is a subset of a larger set available from NIST.

Figure: sample MNIST digits
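
A minimal sketch of loading MNIST with scikit-learn's fetch_openml (the exact keyword arguments may vary with the scikit-learn version, and the first call downloads the data from openml.org):

In [ ]:
from sklearn.datasets import fetch_openml

# Download MNIST: 70,000 handwritten digits, each a 28x28 image
# flattened into 784 features.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
X, y = mnist.data, mnist.target
# The conventional split keeps the last 10,000 samples as the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
print(X_train.shape, X_test.shape)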

References¶

  • Kussul, Ernst; Baidyk, Tatiana (2004). "Improved method of handwritten digit recognition tested on MNIST database". Image and Vision Computing. 22 (12): 971–981. doi:10.1016/j.imavis.2004.03.008 .

ImageNet¶

The most highly-used subset of ImageNet is the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2012-2017 image classification and localization dataset. This dataset spans 1000 object classes and contains 1,281,167 training images, 50,000 validation images and 100,000 test images. This subset is available on Kaggle.
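
A sketch of loading the ILSVRC-2012 validation split with torchvision (assuming PyTorch/torchvision are installed and the ImageNet archives, e.g. the Kaggle distribution mentioned above, have already been downloaded locally; the path below is only a placeholder):

In [ ]:
from torchvision import datasets, transforms

# torchvision does not download ImageNet for you; it only reads a local copy.
imagenet_val = datasets.ImageNet(
    root="/path/to/imagenet",        # placeholder path to the extracted archives
    split="val",
    transform=transforms.ToTensor(),
)
print(len(imagenet_val), len(imagenet_val.classes))  # 50000 images, 1000 classes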

Figure: ImageNet example images (source: Ye Tengqi)

On 30 September 2012, a convolutional neural network (CNN) called AlexNet achieved a top-5 error of 15.3% in the ImageNet 2012 Challenge, more than 10.8 percentage points lower than that of the runner up. This was made feasible due to the use of graphics processing units (GPUs) during training, an essential ingredient of the deep learning revolution. According to The Economist, "Suddenly people started to pay attention, not just within the AI community but across the technology industry as a whole."

References¶

  • J. Deng, W. Dong, R. Socher, L. -J. Li, Kai Li and Li Fei-Fei, "ImageNet: A large-scale hierarchical image database," 2009 IEEE Conference on Computer Vision and Pattern Recognition, 2009, pp. 248-255, doi: 10.1109/CVPR.2009.5206848.
  • Wikipedia: ImageNet
  • Krizhevsky, Alex; Sutskever, Ilya; Hinton, Geoffrey E. (June 2017). "ImageNet classification with deep convolutional neural networks" (PDF). Communications of the ACM. 60 (6): 84–90. doi:10.1145/3065386. ISSN 0001-0782. S2CID 195908774. Retrieved 24 May 2017.
  • Ye Tengqi, "Visual Object Detection from Lifelogs using Visual Non-lifelog Data"

COCO¶

  • Object segmentation
  • Recognition in context
  • Superpixel stuff segmentation
  • 330K images (>200K labeled)
  • 1.5 million object instances
  • 80 object categories
  • 91 stuff categories
  • 5 captions per image
  • 250,000 people with keypoints

Figure: COCO example images with annotations

Source: Microsoft COCO: Common Objects in Context

COCO json¶

A COCO-format JSON file consists of five sections providing information for the entire dataset. For more information, see COCO format.

  • info – general information about the dataset.
  • licenses – license information for the images in the dataset.
  • images – a list of images in the dataset.
  • annotations – a list of annotations (including bounding boxes) that are present in all images in the dataset.
  • categories – a list of label categories.
{
    "info": info, 
    "images": [image], 
    "annotations": [annotation], 
    "licenses": [license],
}
info{
    "year": int, 
    "version": str, 
    "description": str, 
    "contributor": str, 
    "url": str, 
    "date_created": datetime,
}
image{
    "id": int, 
    "width": int,
    "height": int,
    "file_name": str,
    "license": int,
    "flickr_url": str,
    "coco_url": str,
    "date_captured": datetime,
}
license{
    "id": int, 
    "name": str, 
    "url": str,
}
{
    "info": {
        "description": "COCO 2017 Dataset",
        "url": "http://cocodataset.org",
        "version": "1.0","year": 2017,
        "contributor": "COCO Consortium",
        "date_created": "2017/09/01"
    },
    "licenses": [
        {"url": "http://creativecommons.org/licenses/by/2.0/","id": 4,"name": "Attribution License"}
    ],
    "images": [
        {"id": 242287, "license": 4, 
         "coco_url": "http://images.cocodataset.org/val2017/xxxxxxxxxxxx.jpg",
         "flickr_url": "http://farm3.staticflickr.com/2626/xxxxxxxxxxxx.jpg",
         "width": 426, "height": 640, "file_name": "xxxxxxxxx.jpg", 
         "date_captured": "2013-11-15 02:41:42"},
        {"id": 245915, "license": 4, 
         "coco_url": "http://images.cocodataset.org/val2017/nnnnnnnnnnnn.jpg",
         "flickr_url": "http://farm1.staticflickr.com/88/xxxxxxxxxxxx.jpg", 
         "width": 640, "height": 480, "file_name": "nnnnnnnnnn.jpg",
         "date_captured": "2013-11-18 02:53:27"}
    ],
    "annotations": [
        {"id": 125686, "category_id": 0, "iscrowd": 0, 
         "segmentation": [[164.81, 417.51,......167.55, 410.64]], 
         "image_id": 242287, "area": 42061.80340000001, 
         "bbox": [19.23, 383.18, 314.5, 244.46]},
        {"id": 1409619, "category_id": 0, "iscrowd": 0, 
         "segmentation": [[376.81, 238.8,........382.74, 241.17]], 
         "image_id": 245915, "area": 3556.2197000000015, 
         "bbox": [399, 251, 155, 101]},
        {"id": 1410165, "category_id": 1, "iscrowd": 0, 
         "segmentation": [[486.34, 239.01,..........495.95, 244.39]], 
         "image_id": 245915, "area": 1775.8932499999994, 
         "bbox": [86, 65, 220, 334]}
    ],
    "categories": [
        {"supercategory": "speaker","id": 0,"name": "echo"},
        {"supercategory": "speaker","id": 1,"name": "echo dot"}
    ]
}
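
A short sketch of reading such an annotation file with the pycocotools package (pip install pycocotools); the file path below is only a placeholder for a real COCO annotation file such as the one distributed with the 2017 validation set:

In [ ]:
from pycocotools.coco import COCO

# Load the annotation file (parses the "images", "annotations" and
# "categories" sections shown above into lookup indices).
coco = COCO("annotations/instances_val2017.json")

image_ids = coco.getImgIds()                    # ids of all images
image_info = coco.loadImgs(image_ids[0])[0]     # one entry from "images"
ann_ids = coco.getAnnIds(imgIds=image_ids[0])   # annotation ids for that image
annotations = coco.loadAnns(ann_ids)            # entries from "annotations"
print(image_info["file_name"], len(annotations))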

Resources¶

  • Paper: "Microsoft COCO: Common Objects in Context"
  • COCO json format
  • Transforming COCO datasets

Few other datasets...¶

Skin cancer dataset HAM10000¶

Examples of images downloaded from the HAM10000 dataset. These images are publicly available through the International Skin Imaging Collaboration (ISIC) archive and represent more than 95% of all pigmented lesions encountered during clinical practice (Tschandl P 2018). (A) Melanocytic nevus; (B) Benign keratosis; (C) Vascular lesion; (D) Dermatofibroma; (E) Intraepithelial carcinoma; (F) Basal cell carcinoma; (G) Melanoma. Legends inside each image represent clinical data such as age, sex and localization associated with the image. F: female; M: male; LE: lower extremity; B: back; H: hand; T: trunk.

Source: Deep Neural Frameworks Improve the Accuracy of General Practitioners in the Classification of Pigmented Skin Lesions

Liver Segmentation 3D-IRCADb-01¶

Figure: 3D-IRCADb-01 liver segmentation examples (IRCAD)

Annotation tools¶

CVAT¶

Figure: CVAT annotation interface

VIA annotation tool¶

Figure: VIA annotation tool interface

End-to-end training¶

Figure: traditional computer vision vs. deep learning workflow

Taken from "Deep Learning vs. Traditional Computer Vision"

Figure: building block of a CNN

Taken from "Deep Learning vs. Traditional Computer Vision"
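
A minimal sketch (assuming PyTorch is installed) of the convolution + nonlinearity + pooling building block illustrated above:

In [ ]:
import torch
import torch.nn as nn

# One typical CNN building block: convolution, ReLU nonlinearity, max pooling.
block = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),   # halves the spatial resolution
)

x = torch.randn(1, 3, 224, 224)    # a dummy batch with one RGB image
print(block(x).shape)              # torch.Size([1, 16, 112, 112])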


Deep learning¶

Teachable Machine

Detectron2 in Google Colab

Detectron2 on Windows

References¶

  • Deep Learning vs. Traditional Computer Vision

Metacentrum¶

Beginners Guide

Jupyter for MetaCentrum Users

Detectron2 on MetaCentrum
