@phdthesis{Velte2015,
  type     = {Master's Thesis},
  author   = {Maurice Velte},
  title    = {Semantic Image Segmentation Combining Visible and Near-Infrared Channels with Depth Information},
  doi      = {10.13140/RG.2.1.2921.3929},
  pages    = {114},
  year     = {2015},
  abstract = {Image understanding is a vital task in computer vision with many applications in areas such as robotics, surveillance and the automotive industry. An important precondition for image understanding is semantic image segmentation, i.e. the correct labeling of every image pixel with its corresponding object name or class. This thesis proposes a machine learning approach for semantic image segmentation that uses images from a multi-modal camera rig. It demonstrates that semantic segmentation can be improved, compared to a single-image approach, by combining different image types as inputs to a convolutional neural network (CNN). In this work, a multi-channel near-infrared (NIR) image, an RGB image and a depth map are used. The detection of people is further improved by a skin image that indicates the presence of human skin in the scene and is computed from NIR information. It is also shown that segmentation accuracy can be enhanced by a class voting method based on a superpixel pre-segmentation. Models are trained for 10-class, 3-class and binary classification tasks on an original dataset. Compared to the NIR-only approach, average class accuracy increases by 7\% for 10-class and by 22\% for 3-class classification, reaching 48\% and 70\% accuracy, respectively. The binary classification task, which focuses on the detection of people, achieves a classification accuracy of 95\% and a true positive rate of 66\%. This report describes the proposed approach and the challenges encountered, and shows that a CNN can successfully learn and combine features from multi-modal image sets and use them to predict scene labels.},
  language = {en}
}