Skip to contents

Data

In the first step, load the unprocessed data set “raw_example_data”. This data set represents a sample of metabolomic data and exhibits common issues such as missing values and duplicates. Additionally, its current format does not adhere to the specifications required by MeTEor.

# load the example data set by name via data()
data("raw_example_data")
# preview the first five rows of the first eight columns
head(raw_example_data[, 1:8], n = 5)
#> # A tibble: 5 × 8
#>      id treatment   sex  comp metabolite_1 metabolite_2  time metabolite_3
#>   <dbl>     <dbl> <dbl> <dbl>        <dbl>        <dbl> <dbl>        <dbl>
#> 1     1         0     0     1        1.88         0.750     1       -2.23 
#> 2     1         0     0     1        2.22        -0.167     2       -1.05 
#> 3     1         0     0     1        1.85         0.221     3       -0.681
#> 4     1         0     0     1        0.339        1.94      4       -2.36 
#> 5     1         0     0     1        1.16         1.20      5       -2.54

Preprocessing steps:

Remove duplicate cases.

The first step is to check whether the data set contains duplicate cases. If it does, they can be removed from the data set.

# flag every row that is an exact repeat of an earlier row
is_duplicate <- duplicated(raw_example_data)

# keep the repeated rows aside so they can be inspected
duplicated_rows <- raw_example_data[is_duplicate, ]

# retain only the first occurrence of each row
raw_example_data <- raw_example_data[!is_duplicate, ]

head(raw_example_data[, 1:8], n = 5)
#> # A tibble: 5 × 8
#>      id treatment   sex  comp metabolite_1 metabolite_2  time metabolite_3
#>   <dbl>     <dbl> <dbl> <dbl>        <dbl>        <dbl> <dbl>        <dbl>
#> 1     1         0     0     1        1.88         0.750     1       -2.23 
#> 2     1         0     0     1        2.22        -0.167     2       -1.05 
#> 3     1         0     0     1        1.85         0.221     3       -0.681
#> 4     1         0     0     1        0.339        1.94      4       -2.36 
#> 5     1         0     0     1        1.16         1.20      5       -2.54

Reordering and dropping irrelevant columns.

If the data set contains columns that are irrelevant for further analysis, these can be removed from the data set.

# move "time" in front of the first metabolite column, then drop the
# "comp" column since it is not used in the following steps
raw_example_data <- raw_example_data %>%
  relocate("time", .before = "metabolite_1") %>%
  select(-comp)

head(raw_example_data[, 1:8], n = 5)
#> # A tibble: 5 × 8
#>      id treatment   sex  time metabolite_1 metabolite_2 metabolite_3
#>   <dbl>     <dbl> <dbl> <dbl>        <dbl>        <dbl>        <dbl>
#> 1     1         0     0     1        1.88         0.750       -2.23 
#> 2     1         0     0     2        2.22        -0.167       -1.05 
#> 3     1         0     0     3        1.85         0.221       -0.681
#> 4     1         0     0     4        0.339        1.94        -2.36 
#> 5     1         0     0     5        1.16         1.20        -2.54 
#> # ℹ 1 more variable: metabolite_4 <dbl>

Checking data types.

For the analysis in MeTEor, it is important that metabolite values are variables of the “numeric” type. The “time” variable should also be a variable of the type “numeric” or “integer”. The categorical variables can be of different data types.

# list the columns that are not numeric (for inspection);
# vapply() guarantees a logical vector, even for a data set without columns
non_numeric_columns <- names(raw_example_data)[
  !vapply(raw_example_data, is.numeric, logical(1))
]

# convert every character column to numeric
cleaned_data <- raw_example_data %>%
  mutate(across(where(is.character), as.numeric))

# Select the numeric columns you want to normalize
# (here: everything from the fifth column onwards)
columns_to_normalize <- 5:ncol(cleaned_data)

# Min-max normalize a numeric vector to the range [0, 1].
#
# x: numeric vector, may contain missing values.
# Returns a vector of the same length as x. Missing values are ignored when
# the minimum and maximum are determined and stay NA in the result, so a
# single NA no longer turns the whole column into NA.
normalize <- function(x) {
  # nothing to scale if every value is missing; also avoids the
  # "no non-missing arguments" warnings of range()
  if (all(is.na(x))) {
    return(x)
  }
  value_range <- range(x, na.rm = TRUE)
  (x - value_range[1]) / (value_range[2] - value_range[1])
}

# rescale each selected column on its own and write the result back in place
cleaned_data[columns_to_normalize] <- lapply(
  cleaned_data[columns_to_normalize],
  normalize
)

head(cleaned_data[, 1:8], n = 5)
#> # A tibble: 5 × 8
#>      id treatment   sex  time metabolite_1 metabolite_2 metabolite_3
#>   <dbl>     <dbl> <dbl> <dbl>        <dbl>        <dbl>        <dbl>
#> 1     1         0     0     1           NA           NA        0.217
#> 2     1         0     0     2           NA           NA        0.392
#> 3     1         0     0     3           NA           NA        0.447
#> 4     1         0     0     4           NA           NA        0.199
#> 5     1         0     0     5           NA           NA        0.172
#> # ℹ 1 more variable: metabolite_4 <dbl>

Imputation

The dataset contains missing values, which need to be addressed before conducting analysis in MeTEor. Initially, it’s crucial to identify metabolites with missing data. Some metabolites may exhibit a higher prevalence of missing values. For instance, metabolites with more than 10% missing values can be filtered out initially. Subsequently, for the remaining missing values, techniques like k-nearest neighbor imputation can be employed to impute the missing data.


# Collect the names of all columns that contain at least one missing value
has_missing <- vapply(cleaned_data, anyNA, logical(1))
columns_with_missing <- names(cleaned_data)[has_missing]
head(columns_with_missing)
#> [1] "metabolite_1"  "metabolite_2"  "metabolite_5"  "metabolite_8" 
#> [5] "metabolite_35" "metabolite_36"

# Remove columns with more than 10% NA:
# keep a column when its share of missing values is at most 10%
# (a column with exactly 10% NA is kept, matching "more than 10%")
na_fraction <- colMeans(is.na(cleaned_data))
cleaned_data <- cleaned_data[, na_fraction <= 0.1]

# impute the remaining missing values using KNN
data_meta_imputed <- kNN(
  cleaned_data,
  numFun = weightedMean,
  weightDist = TRUE,
  imp_var = FALSE
)
#> Warning in kNN(cleaned_data, numFun = weightedMean, weightDist = TRUE, imp_var
#> = FALSE): Nothing to impute, because no NA are present (also after using
#> makeNA)

# preview the first five rows of the first eight columns after imputation
head(data_meta_imputed[, seq_len(8)], n = 5)
#>   id treatment sex time metabolite_3 metabolite_4 metabolite_6 metabolite_7
#> 1  1         0   0    1    0.2173387    0.5713050    0.4797063    0.7120786
#> 2  1         0   0    2    0.3923829    0.5010947    0.5095061    0.8321946
#> 3  1         0   0    3    0.4468586    0.5865550    0.4192313    0.3186937
#> 4  1         0   0    4    0.1986379    0.5804491    0.5659878    0.6971688
#> 5  1         0   0    5    0.1716386    0.6713395    0.3987762    0.7795458

Change data format

The data set is transformed from wide to long format to make it compatible with MeTEor. This also involves reordering the columns so that the data set is structured appropriately.

# reshape to long format: every column except the identifiers becomes one
# (metabolites, values) pair per row; afterwards the columns are ordered as
# id, time, metabolites, values, treatment, sex
data_meta_imputed_long <- data_meta_imputed %>%
  pivot_longer(
    cols = !c("id", "time", "sex", "treatment"),
    names_to = "metabolites",
    values_to = "values"
  ) %>%
  relocate(time, .after = id) %>%
  relocate(treatment, sex, .after = values)

head(data_meta_imputed_long, n = 5)
#> # A tibble: 5 × 6
#>      id  time metabolites  values treatment   sex
#>   <dbl> <dbl> <chr>         <dbl>     <dbl> <dbl>
#> 1     1     1 metabolite_3  0.217         0     0
#> 2     1     1 metabolite_4  0.571         0     0
#> 3     1     1 metabolite_6  0.480         0     0
#> 4     1     1 metabolite_7  0.712         0     0
#> 5     1     1 metabolite_9  0.402         0     0

Export data set

# write the long-format data to a CSV file without a row-name column
write.csv(
  x = data_meta_imputed_long,
  file = "data_long.csv",
  row.names = FALSE
)

Additional Information

For more methods and background on preprocessing metabolomics data, refer to Karaman, I. (2017), “Preprocessing and pretreatment of metabolomics data for statistical analysis”.