-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
108 lines (97 loc) · 4.04 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import numpy as np
from tensorflow.python.keras.preprocessing.image import load_img, img_to_array
def get_signname_dict():
    """
    Reads the sign names from the file "signnames.csv" inside the "res" directory
    (resolved relative to the current working directory).
    The first row is treated as a header and skipped. Every following row is keyed
    by its zero-based position and mapped to the value of its second column.
    :return: dict {row index (int): second column of that row (str)}
    """
    import csv
    # os.path.join instead of a hard-coded "\\" so the file is also found outside Windows
    csv_path = os.path.join("res", "signnames.csv")
    # newline="" is what the csv module asks for when handing it a file object
    with open(csv_path, newline="") as csv_file:
        reader = csv.reader(csv_file, delimiter=",")
        next(reader, None)  # skip the header row (no-op for an empty file)
        return {index: row[1] for index, row in enumerate(reader)}
# Module-level lookup table {row index: sign name}; built once when this module is
# imported, so importing the module reads the sign name CSV from disk.
signnames = get_signname_dict()
def process_image(img, img_size, grayscale=False):
    """
    Loads an image from file, resizes it to img_size and returns it as numpy array.
    Values are normalized to range [0,1].
    :param img: image path
    :param img_size: target size tuple, passed unchanged to load_img as target_size
                     (NOTE(review): Keras documents target_size as (height, width) - confirm callers)
    :param grayscale: set True to load the image with color_mode 'grayscale' instead of 'rgb'
    :return: ndarray produced by img_to_array, scaled to [0,1]
    """
    color_mode = 'grayscale' if grayscale else 'rgb'
    image = load_img(img, target_size=img_size, color_mode=color_mode)
    pixels = img_to_array(image) / 255
    # clip keeps the result inside [0,1] even if the scaled values overshoot
    return pixels.clip(0, 1)
def _list_training_files(train_path):
    """Returns (class_labels, [(file path, class label), ...]) for all .ppm files below the class folders of train_path."""
    class_labels = []
    labelled_files = []
    # Directory listings come back in arbitrary order; sorting makes the file order
    # (and therefore the result of the fixed-seed shuffle) the same on every machine.
    for entry in sorted(os.listdir(train_path)):
        class_dir = os.path.join(train_path, entry)
        if not os.path.isdir(class_dir):
            continue
        # class labels from 0 to 42 as int instead of "00000" to "00042"
        label = int(entry)
        class_labels.append(label)
        for root, dirnames, filenames in os.walk(class_dir):
            dirnames.sort()  # in-place sort fixes the order in which os.walk descends
            for filename in sorted(filenames):
                if filename.endswith('.ppm'):
                    labelled_files.append((os.path.join(root, filename), label))
    return class_labels, labelled_files


def _read_test_labels(test_labels_path):
    """Reads the ';'-separated label file into {first column: eighth column}, skipping the header row."""
    import csv
    label_dict = {}  # (file_name, class label)
    with open(test_labels_path, newline="") as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        next(reader, None)  # header row
        for row in reader:
            if row:  # ignore blank lines
                label_dict[row[0]] = row[7]
    return label_dict


def get_dataset(train_path, test_path, test_labels_path, img_size, grayscale=False):
    """
    Builds the dataset from file.
    Training images are the .ppm files found below the sub directories of train_path;
    the name of each sub directory is parsed as the integer class label of its images.
    Test images are the .ppm files directly inside test_path; their labels are read
    from the ';'-separated file test_labels_path (first column: file name, eighth column: class label).
    Both subsets are shuffled with a fixed random state.
    :param train_path: directory for training images
    :param test_path: directory for test images
    :param test_labels_path: csv file with the class labels of the test set
    :param img_size: image size, handed to process_image for training and test images
    :param grayscale: set True to use grayscale images
    :return: tuple (xtrain, ytrain, xtest, ytest) of numpy arrays
    :raises ValueError: if no .ppm training image is found
    """
    # read training set and labels
    class_labels, labelled_files = _list_training_files(train_path)
    if not labelled_files:
        # the test array below takes its image shape from the training images
        raise ValueError("no .ppm training images found in {}".format(train_path))

    # build training set
    xtrain = np.array([process_image(path, img_size, grayscale) for path, _ in labelled_files])
    ytrain = np.array([label for _, label in labelled_files], dtype=int)
    print("number of classes: {}".format(len(class_labels)))
    print("shape of train images: {}".format(xtrain.shape))
    print("shape of labels: {}".format(ytrain.shape))

    # build test set
    test_files = sorted(f for f in os.listdir(test_path) if f.endswith(".ppm"))
    print("number of test images: {}".format(len(test_files)))
    xtest = np.zeros((len(test_files),) + xtrain.shape[1:])
    ytest = np.zeros(len(test_files), dtype=int)
    label_dict = _read_test_labels(test_labels_path)
    for ix, file_name in enumerate(test_files):
        # resize with img_size (not a fixed 64x64) so every test image fits its slot in xtest
        xtest[ix] = process_image(os.path.join(test_path, file_name), img_size, grayscale)
        stem = file_name.split(".")[0]
        # label rows may be keyed by the bare name or by the full file name; accept both
        key = stem if stem in label_dict else file_name
        ytest[ix] = int(label_dict[key])
    print("shape of test images: {}".format(xtest.shape))
    print("shape of test labels: {}".format(ytest.shape))

    # shuffle data with fixed random state for retraining and reproducibility
    from sklearn.utils import shuffle
    xtrain, ytrain = shuffle(xtrain, ytrain, random_state=25)
    xtest, ytest = shuffle(xtest, ytest, random_state=25)
    return xtrain, ytrain, xtest, ytest