@comment{Starcevic2020: master's thesis record. The entry type now matches the
  "type" field (it was declared as a PhD thesis), the author uses the
  unambiguous "Last, First" form, and each field sits on its own line. The
  citation key and all field values are unchanged.}
@mastersthesis{Starcevic2020,
  author        = {Starcevic, Nikolai},
  title         = {Approaches for dealing with structural- and rounded zeros in data mining tasks},
  type          = {Master Thesis},
  year          = {2020},
  pages         = {70},
  doi           = {10.25924/opus-3703},
  language      = {en},
  abstract      = {Zeros can cause many issues in data analysis and dealing with them requires specialized procedures. We differentiate between rounded zeros, structural zeros and missing values. Rounded zeros occur when the true value of a variable is hidden because of a detection limit in whatever mechanism was used to acquire the data. Structural zeros are values which are truly zero, often coming about due to a hidden mechanism separate from the one which generates values greater than 0. Missing values are values that are completely missing for unknown or known reasons. This thesis outlines various methods for dealing with different kinds of zeros in different contexts. Many of these methods are very specific in their ideal usecase. They are separated based on which kind of zero they are intended for and if they are better suited for compositional or for standard data. For rounded zeros we impute the zeros with an estimated value below the detection limit. The author describes multiplicative replacement, a simple procedure that imputes values at a fixed fraction of the detection limit. As a more advanced technique, the author describes Kaplan Meier smoothing spline replacement, which interpolates a spline on a Kaplan Meier curve and uses the spline below the detection limit to impute values in a more natural distribution. Rounded zeros cannot be imputed with the same techniques that would be used for regular missing values, since there is more information available on the true value of a rounded zero than there would be for a regular missing value. Structural zeros cannot be imputed since they are a true zero. Imputing them would falsify their values and produce a value where there should be none. 
Because of this, we apply modelling techniques that can work around structural zeros and incorporate them. For standard data, the zero inflated Poisson model is presented. This model utilizes a mixture of a logistic and a Poisson distribution to accurately model data with a large amount of structural zeros. While the Poisson distribution is only applicable to count data, the zero inflation concept can be applied to different kinds of distributions. For compositional data, the zero adjusted Dirichlet model is introduced. This model mixes Dirichlet distributions for every pattern of zeros found within the data. Non-algorithmic techniques to reduce the amount of structural zeros present are also shown. These techniques being amalgamation, which combines columns with structural zeros into more broad descriptors and classification, which changes columns into categorical values based on a structural zero being present or not. Missing values are values that are completely missing for various known or unknown reasons. Different imputation techniques are introduced. For standard data, MissForest imputation is introduced, which utilizes a RandomForest regression to impute mixed type missing values. Another imputation technique shown utilizes both a genetic algorithm and a neural network to impute values based on the genetic algorithm minimizing the error of an autoencoder neural network. In the case of compositional data, knn imputation is presented, which utilizes the knn concept also found in knn clustering to impute the values based on the closest samples with a value available. All of these methods are explained and demonstrated to give readers a guide to finding the suitable methods to use in different scenarios. The thesis also provides a general guide on dealing with zeros in data, with decision flowcharts and more detailed descriptions for both compositional and standard data being presented. 
General tips on getting better results when zeros are involved are also given and explained. This general guide was then applied to a dataset to show it in action.},
  internal-note = {TODO(review): the required "school" field is missing and the institution is not stated anywhere in this record. Resolve the DOI to confirm the degree-granting institution, then add school. The "pages" value looks like a total page count rather than a page range; confirm.},
}