library(ggplot2) # Loaded for the `diamonds` dataset used throughout this script

# Two-sample t-test demo ----
#
# The idea of t.test is to evaluate whether the distribution of a numeric
# variable differs between the groups of a nominal variable.
#
# To demonstrate this, we keep only the 'Fair' and 'Ideal' levels of the
# factor variable `cut`, and then compare the values of a numeric variable
# (`price`) between those two groups.

# Subset the data ----

data <- diamonds[diamonds$cut %in% c("Fair", "Ideal"), ]

# Drop the levels of `cut` that are no longer used after the subset, so that
# grouping functions only see the two remaining groups.
data$cut <- droplevels(data$cut)

df1 <- data[, c("cut", "price")]

# Compare group means ----

# We can see the price means are different for each group
tapply(df1$price, df1$cut, mean)
#     Fair    Ideal
# 4358.758 3457.542

# Formula interface to t.test ----
#
# The idea is that the numeric variable is explained by a group variable:
#   t.test(numeric_variable ~ group_variable, data = data)
#
# In our case:
#   - the numeric_variable is price
#   - the group_variable is cut
t.test(price ~ cut, data = data)
#   Welch Two Sample t-test
#
# data:  price by cut
# t = 9.7484, df = 1894.8, p-value < 2.2e-16
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
#   719.9065 1082.5251
# sample estimates:
#  mean in group Fair mean in group Ideal
#            4358.758            3457.542

# Visual check ----

# Another way to validate the previous results is to plot the two
# distributions side by side using a box-plot (drawn on the active device).
plot(price ~ cut, data = data, ylim = c(0, 12000), col = "deepskyblue3")

# Save the same box-plot to disk. png() cannot write into a directory that
# does not exist, so create the output directory first (a no-op, without a
# warning, when it is already there).
dir.create("t_test", showWarnings = FALSE)
png("t_test/ttest_boxplot.png")
plot(price ~ cut, data = data, ylim = c(0, 12000), col = "deepskyblue3")
dev.off()