Here are instructions on how to use R for statistics:
Introductory Instructions:
?functionname
load("name.Rdata")
source("functionname.R")
newvarname = load.data("filename.csv")
names(newvarname)
singlevarname = newvarname$var
singlevarname = c(x1, x2, x3, etc)
where x1, x2, x3, etc. are quantitative data; or
singlevarname = c("d1", "d2", "d3", etc)
where "d1", "d2", "d3", etc. are qualitative data
do(n)*function(arguments)
where n is the number of repetitions
# this is a comment
Section 1.3: Simple Random Sampling
sample(singlevarname, n, replace=F)
where n is sample size
Section 1.4: Other Effective Sampling Methods
systematic(singlevarname, n)
where n is sample size
stratified(varname, strataVarname, n)
where varname is the variable containing the population,
strataVarname is the variable containing the levels (i.e., strata), and n is sample size
Section 2.1: Organizing Qualitative Data
table(singlevarname)
table(singlevarname)/length(singlevarname)
Section 2.1: Organizing Qualitative Data: The Popular Displays
barplot(table(singlevarname), xlab="variable name", ylab="frequency", main="Long Descriptive Title")
barplot(table(singlevarname)/length(singlevarname), xlab="variable name", ylab="relative frequency", main="Long Descriptive Title")
pareto.chart(table(singlevarname), xlab="variable name", ylab="relative frequency", main="Long Descriptive Title")
barplot(xtabs(~ levelvariable + singlevarname ), xlab="variable name", ylab="frequency", main="Long Descriptive Title", beside = TRUE, legend.text = levels(levelvariable), col = c("colorname1", "colorname2", etc.))
Section 2.2: Organizing Quantitative Data: The Popular Displays
hist(singlevarname, xlab="variable name", main="Long Descriptive Title")
hist(singlevarname, freq=FALSE, xlab="variable name", ylab="relative frequency", main="Long Descriptive Title")
hist(singlevarname, xlab="variable name", main="Long Descriptive Title", breaks = c(b1, b2, b3, b4, etc))
where b1, b2, b3, etc. are lower/upper class limits
stem(singlevarname, scale=value)
note: use scale=10 to move decimal place one place to left in stemplot;
use scale=100 to move decimal place two places to left in stemplot; use scale=0.1 to move decimal place one place to right in stemplot; etc.
stem.leaf.backback(var1, var2, back.to.back=T, unit=value, m=1)
note: use unit=10 to move decimal place one place to left in stemplot;
use unit=100 to move decimal place two places to left in stemplot; use unit=0.1 to move decimal place one place to right in stemplot; etc.
Section 3.1: Measures of Central Tendency
sum(singlevarname)/length(singlevarname); or
mean(singlevarname)
median(singlevarname)
Section 3.2: Measures of Dispersion [a.k.a., Spread]
max(singlevarname) - min(singlevarname)
sum((mean(singlevarname)-singlevarname)^2)/length(singlevarname)
var(singlevarname)
sqrt(sum((mean(singlevarname)-singlevarname)^2)/length(singlevarname))
sigma(singlevarname)
sqrt(var(singlevarname)); or
sd(singlevarname)
qqnorm(singlevarname)
Section 3.4: Measures of Positions and Outliers
qnorm(k, mean(singlevarname), sd(singlevarname))
z = (xvalue - mean(singlevarname))/sd(singlevarname)
quantile(singlevarname, 0.25, type=2)
quantile(singlevarname, 0.75, type=2)
quantile(singlevarname, 0.75, type=2) - quantile(singlevarname, 0.25, type=2); or
IQR(singlevarname, type=2)
lowerfence = quantile(singlevarname, 0.25, type=2) - 1.5*IQR(singlevarname, type=2)
upperfence = quantile(singlevarname, 0.75, type=2) + 1.5*IQR(singlevarname, type=2)
singlevarname[singlevarname < lowerfence]
singlevarname[singlevarname > upperfence]
Section 3.5: The Five Number Summary and Boxplots
fivenum(singlevarname)
boxplot(singlevarname, ylab="variable name", main="Long Descriptive Title")
– note: use the argument horizontal=TRUE to change the orientation of the boxplot – make sure that you also switch the labels
boxplot(singlevarname ~ levels, data=newvarname, xlab="description of levels", ylab="variable name", main="Long Descriptive Title")
attach(newvarname)
par(mfrow=c(2,1))
hist(singlevarname, xlab="variable name", ylab="frequency", main="Long Descriptive Title")
boxplot(singlevarname, xlab="variable name", horizontal=TRUE)
detach(newvarname)
Section 4.1: Scatter Diagram and Correlation
plot(xvarname, yvarname, xlab="x variable name", ylab="y variable name", main="Long Descriptive Title")
cor(xvarname, yvarname, method="pearson", use="complete.obs")
– note: use="complete.obs" removes any 'pair' of observations that has missing value(s)
Section 4.2: Least-Squares Regression
lm(formula = yvarname ~ xvarname)
– note: gives Intercept and coefficient of xvarname
plot(xvarname, yvarname, xlab="x variable name", ylab="y variable name", main="Long Descriptive Title")
abline(lm(formula = yvarname ~ xvarname))
Section 4.3: Coefficient of Determination
(cor(xvarname, yvarname, method="pearson"))^2
Section 6.1: Discrete Random Variables
x = c(x1, x2, x3)
p = c(p1, p2, p3)
barplot(p, names=x, xlab="variable name", ylab="probability", main="Long Descriptive Title")
x = c(x1, x2, x3)
p = c(p1, p2, p3)
sum(x*p)
x = c(x1, x2, x3)
p = c(p1, p2, p3)
sum((x-sum(x*p))^2*p)
x = c(x1, x2, x3)
p = c(p1, p2, p3)
sqrt(sum((x-sum(x*p))^2*p))
Section 6.2: The Binomial Probability Distribution
choose(n, r)
dbinom(k, prob=p, size=n)
sum(dbinom(a:b, prob=p, size=n))
barplot(dbinom(0:n, prob=p, size=n), names=0:n)
Section 7.1: Properties of the Normal Distribution
x = seq(-20,20,by=0.1)
y = dnorm(x, mean=m, sd=s)
plot(x, y)
pnorm(x, mean=m, sd=s)
pnorm(x, mean=m, sd=s, lower.tail=FALSE)
pnorm(b, mean=m, sd=s) - pnorm(a, mean=m, sd=s)
x = singlevarname
hist(x, freq=FALSE, xlab="variable name", ylab="relative frequency", main="Long Descriptive Title")
curve(dnorm(x, mean=mean(x), sd=sd(x)), add=TRUE)
Section 7.2: The Standard Normal Distribution
qnorm(α, mean=m, sd=s, lower.tail=FALSE)
pnorm(a)
pnorm(a, lower.tail=FALSE)
pnorm(b) - pnorm(a)
Section 7.3: Applications of the Normal Distribution
pnorm(a, mean=m, sd=s)
pnorm(a, mean=m, sd=s, lower.tail=FALSE)
pnorm(b, mean=m, sd=s) - pnorm(a, mean=m, sd=s)
qnorm(x, mean=m, sd=s)
where x is the area (probability) to the left, as a decimal
Section 7.4: Assessing Normality
qqnorm(singlevarname)
Section 9.1: The Logic in Constructing Confidence Intervals for a Population Mean
zInterval(mean=samplemean, sd=population_sd, n=samplesize, conf.level=C-Level)
where C-Level is confidence level as a decimal; the arguments may be used in any order
Section 9.2: Confidence Intervals for a Population Mean
qt(1 - α, df)
where α is area to the right; df = n – 1
source("tInterval.R")
tInterval(mean=samplemean, sd=sample_sd, n=samplesize, conf.level=C-Level)
where C-Level is confidence level as a decimal; the arguments may be used in any order
t.test(data1, mu=mu_0, conf.level=C-Level)
where data1 is variable containing data;
mu_0 is population mean used in hypotheses; and C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α]
Section 9.3: Confidence Intervals for a Population Proportion
prop.test(x=number_of_successes, n=samplesize, p=p_0, conf.level=C-Level, alternative="two.sided")
where x is the number of successes;
n is sample size; p_0 is population proportion used in hypotheses; and C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α]
Section 10.3: Hypothesis Test for a Population Mean – Population Standard Deviation Unknown
t.test(data1, mu=mu_0, conf.level=C-Level, alternative="two.sided")
where data1 is variable containing data;
mu_0 is population mean used in hypotheses; C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α];
alternative is symbol used in alternate hypothesis: "less" for <, "greater" for >, "two.sided" for ≠
Section 10.4: Hypothesis Test for a Population Proportion
prop.test(x=number_of_successes, n=samplesize, p=p_0, conf.level=C-Level, alternative="two.sided")
where x is the number of successes;
n is sample size; p_0 is population proportion used in hypotheses; C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α];
alternative is symbol used in alternate hypothesis: "less" for <, "greater" for >, "two.sided" for ≠
Section 11.1: Inferences about Two Means: Dependent Samples
t.test(data1, data2, paired=TRUE, conf.level=C-Level, alternative="two.sided")
where data1 and data2 are variables containing data;
C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α];
alternative is symbol used in alternate hypothesis: "less" for <, "greater" for >, "two.sided" for ≠
Section 11.2: Inferences about Two Means: Independent Samples
t.test(data1, data2, conf.level=C-Level)
where data1 and data2 are variables containing data;
C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α]
t.test(data1, data2, conf.level=C-Level, var.equal=TRUE)
where data1 and data2 are variables containing data;
C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α]
t.test(data1, data2, conf.level=C-Level, alternative="two.sided")
where data1 and data2 are variables containing data;
C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α];
alternative is symbol used in alternate hypothesis: "less" for <, "greater" for >, "two.sided" for ≠
t.test(data1, data2, conf.level=C-Level, alternative="two.sided", var.equal=TRUE)
where data1 and data2 are variables containing data;
C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α];
alternative is symbol used in alternate hypothesis: "less" for <, "greater" for >, "two.sided" for ≠
Section 11.3: Two-Sample Hypothesis Test for Population Proportions, using z-statistic
prop.test(x = c(x1,x2), n=c(n1,n2), conf.level=C-Level, alternative="two.sided")
where x1 and x2 are the number of successes; n1 and n2 are sample sizes;
C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α];
alternative is symbol used in alternate hypothesis: "less" for <, "greater" for >, "two.sided" for ≠
prop.test(x = c(x1,x2), n=c(n1,n2), conf.level=C-Level, alternative="two.sided")
where x1 and x2 are the number of successes; n1 and n2 are sample sizes;
C-Level is confidence level as a decimal [i.e., 1 – α/2 or 1 – α]
Section 12.1: Goodness-of-Fit Test
qchisq(alpha, df)
chisq.test(c(x1,x2,x3),p=c(p1,p2,p3))
where x1, x2, x3, etc. are data;
p1, p2, p3, etc., are the respective probabilities that add to 1
Section 12.2: Tests for Independence and the Homogeneity of Proportions
rowvariable1 = c(c_{1,1},c_{1,2},c_{1,3})
where c_{1,1},c_{1,2},c_{1,3}, etc. are counts in the first row of matrix;
rowvariable2 = c(c_{2,1},c_{2,2},c_{2,3})
where c_{2,1},c_{2,2},c_{2,3}, etc. are counts in the second row of matrix;
rowvariable3 = c(c_{3,1},c_{3,2},c_{3,3})
where c_{3,1},c_{3,2},c_{3,3}, etc. are counts in the third row of matrix;
chisq.test(data.frame(rowvariable1, rowvariable2, rowvariable3))
variable.fair = c(c_{1,1},c_{1,2},c_{1,3})
where c_{1,1},c_{1,2},c_{1,3}, etc. are counts in the first row of matrix;
variable.bias = c(c_{2,1},c_{2,2},c_{2,3})
where c_{2,1},c_{2,2},c_{2,3}, etc. are counts in the second row of matrix;
chisq.test(rbind(variable.fair, variable.bias))
chisq.test(rbind(variable.fair, variable.bias))$expected
Section x.x: One-Way Analysis of Variance (ANOVA)
variable1 = c(c_{1,1},c_{1,2},c_{1,3})
where c_{1,1},c_{1,2},c_{1,3}, etc. are values in sample 1;
variable2 = c(c_{2,1},c_{2,2},c_{2,3})
where c_{2,1},c_{2,2},c_{2,3}, etc. are values in sample 2;
variable3 = c(c_{3,1},c_{3,2},c_{3,3})
where c_{3,1},c_{3,2},c_{3,3}, etc. are values in sample 3;
samplesvariable = data.frame(variable1, variable2, variable3)
samplesvariable = stack(samplesvariable)
oneway.test(values ~ ind, data=samplesvariable, var.equal=TRUE)