January 31, 2020
Parts of this document contain detailed discussion in addition to instructions. These details are for reference only - something you can use if/when you later revisit this tutorial. During the workshop, this author will engage in discussion and provide verbal instructions. You will primarily use this guide for the R syntax (i.e. code) it contains.
We encourage you not to copy-paste any of the commands from this tutorial into R. Type them out instead. Sure, you may make mistakes that will have to be corrected, but that's the best way to learn. We can guarantee that you will retain this material in your memory far better if you do not take the easy route.
R is an open-source statistical data analysis and visualization program. It is also a computer programming language. There are many proprietary software programs that can be used for statistical data analysis, for example SAS and MATLAB. Most such programs cost money, even if the cost does not come out of your own pocket. These programs are also closed source, meaning that the computer code needed to make them work is usually a closely held secret that end users are not privy to. This stifles innovation.
In contrast, the entire R programming language is open source. Scientists and programmers believe that this allows for and encourages further innovation. Because the code is freely available to anyone, it is constantly being modified and improved, for example by adding new functionality and removing bugs. The same is not possible with closed-source programs.
When you install the R distribution on your computer, it comes with a range of basic functions, enough to perform basic but wide-ranging statistical analyses (think ANOVA, t-tests etc.) and then visualize the results (think histograms, scatterplots, barplots etc.). This suite of functions is collectively called the base-R distribution. The latest version of R is 3.6.2. Each version also gets a name; version 3.6.2 is named Dark and Stormy Night. R was developed by the R Foundation for Statistical Computing, located at https://r-project.org, which provides a great deal of information related to R.
In its simplest form, R can be run through a terminal, which provides a command-line interface. Simply type R inside a terminal (the CMD program in Windows) on Linux, Macintosh or Windows, and R will launch.
However, not everyone is fond of command-line interfaces. R-Studio exists as an interface bridge between R and you. But that's not the only thing it does: it is an extremely feature-rich GUI program that brings you enhanced functionality for using R. If you think of R as the backbone, then R-Studio provides a graphical user interface to interact with that backbone. When you launch R-Studio, it automatically detects the R software installed on your computer and interfaces with it. Let's take a look at the R-Studio interface to become familiar with it.
Much of the work you do will happen in the two left panes of R-Studio. These are the SOURCE pane, where you will write your R commands before executing them, and the CONSOLE, where the execution happens. The console is the same thing as running R in a terminal.
You can always type commands directly in the console without writing them in the source first. This allows you to test commands out. When your workflow begins to get more complex, the source comes in handy.
You will get the hang of R-Studio once we start working through it. The author will explain more features as they become relevant for the task at hand.
We will start with very simple tasks and build on that so that by the end of the day you will have a better idea of what R is capable of.
At a very basic level, R can be used as a calculator, both simple and scientific. When performing calculations, the basic rules of mathematics apply, i.e. the PEMDAS order of operations.
Let's try some exercises. Here are the commands and their outcomes:
489 * 20
[1] 9780
5222/40.78
[1] 128.053
(32*8) + (1000/5.2)
[1] 448.3077
log(7)
[1] 1.94591
log10(7)
[1] 0.845098
550^2
[1] 302500
sqrt(302500)
[1] 550
Much of the work in this environment is done by manipulating objects that are stored in R's memory. These objects could be something created within R or existing data brought into the environment. In either case, you will need to make use of the assignment operator <- or =. The two are interchangeable. In my experience, most people use the first symbol, though nothing says you have to use one or the other; they are fully equivalent.
For example, let's store the number 8 into an object a, and then create a second object as another example. As you can see in the sketch below, both objects end up in R's memory along with their values.
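The original commands are not shown here; a minimal sketch of what they might have looked like (the second object's name and value are assumptions):

```r
a <- 8   # store the number 8 in an object called a
b <- 3   # another example: store 3 in an object called b
ls()     # list the objects currently in R's memory
# [1] "a" "b"
```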
Note: ls() is a command for listing objects in the workspace. The () you see after it is part of R syntax: every command (function call) in R must end with parentheses. Although we aren't doing so here, one usually passes various arguments or options to the command inside those parentheses.
To check their values, simply type their names at the prompt:
There are many different types of objects that you can store in R's memory, ranging from very simple to very complex. Objects can be singular, storing a single value (numeric or character), or vectors (multiple numbers and/or characters), or large matrices consisting of multiple rows and columns. A typical spreadsheet that we have all worked with in the past is an example of this last class of object. Let's take a look at some moderately complex examples:
Note how c(), the concatenate function, allows you to create a vector of numbers. You could have created this exact same vector another way, e.g. with the colon (sequence) operator. As you can see in the sketch below, there are multiple methods for performing the same operation; the second takes less effort and is smarter, and it omits the concatenate function entirely.
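The original commands are not shown; a minimal sketch of the two equivalent methods (the vector months is used again further below):

```r
months <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)  # method 1: concatenate
months <- 1:12                                      # method 2: the colon (sequence) operator
```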
Let's make a character vector:
monthsC <- c("January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December")
monthsC
[1] "January" "February" "March" "April" "May" "June"
[7] "July" "August" "September" "October" "November" "December"
We can perform various functions on the vectors. For example, if the vectors are of equal length, we can combine them together into a data frame.
year <- data.frame(months, monthsC)
year
months monthsC
1 1 January
2 2 February
3 3 March
4 4 April
5 5 May
6 6 June
7 7 July
8 8 August
9 9 September
10 10 October
11 11 November
12 12 December
You will notice that R nicely formats the objects when printing them to the screen. This is quite useful when you are looking at larger data frames.
Exercise: Create a new vector (call it numdays) containing the number of days in each month (assume a leap year), and create a new data frame containing the months, monthsC and numdays vectors.
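One possible solution (the data frame name year2 is an arbitrary choice):

```r
numdays <- c(31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)  # leap-year month lengths
year2 <- data.frame(months, monthsC, numdays)                 # combine the three vectors
```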
In our last example, we created a data frame that is a combination of a character and a numeric vector. But data frames can also be completely numeric, as in the following example:
jackhighC <- c(-2, 0, 6, 11, 17, 23, 28, 27, 22, 14, 4, -2)
jacklowC <- c(-15, -13, -8, -4, 0, 3, 5, 4, 0, -4, -9, -14)
Now let's make a numeric data frame with these data:
jacktemp <- data.frame(jackhighC, jacklowC)
jacktemp
jackhighC jacklowC
1 -2 -15
2 0 -13
3 6 -8
4 11 -4
5 17 0
6 23 3
7 28 5
8 27 4
9 22 0
10 14 -4
11 4 -9
12 -2 -14
jacktemp <- data.frame(year, jacktemp)
jacktemp
months monthsC jackhighC jacklowC
1 1 January -2 -15
2 2 February 0 -13
3 3 March 6 -8
4 4 April 11 -4
5 5 May 17 0
6 6 June 23 3
7 7 July 28 5
8 8 August 27 4
9 9 September 22 0
10 10 October 14 -4
11 11 November 4 -9
12 12 December -2 -14
Notice that in the above example, we overwrote the original data frame jacktemp by combining it with the year data frame, but we retained the original name. We could have stored this new data frame under a new name, but the method we used reduces clutter.
Now let's convert the temperatures listed in degrees Celsius to degrees Fahrenheit. To do this, multiply the temperature in degrees Celsius by 9/5, then add 32 to the result. For example, if you were converting 4C to Fahrenheit, you would do this:
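The original snippet isn't shown; the arithmetic reconstructed:

```r
(4 * 9/5) + 32
# [1] 39.2
```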
jacktemp$jackhighF <- (jacktemp$jackhighC * 9/5) + 32
jacktemp
months monthsC jackhighC jacklowC jackhighF
1 1 January -2 -15 28.4
2 2 February 0 -13 32.0
3 3 March 6 -8 42.8
4 4 April 11 -4 51.8
5 5 May 17 0 62.6
6 6 June 23 3 73.4
7 7 July 28 5 82.4
8 8 August 27 4 80.6
9 9 September 22 0 71.6
10 10 October 14 -4 57.2
11 11 November 4 -9 39.2
12 12 December -2 -14 28.4
Exercise: Repeat the conversion for the jacklowC column.
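A possible solution, following the same pattern as the jackhighF conversion above:

```r
jacktemp$jacklowF <- (jacktemp$jacklowC * 9/5) + 32  # add a Fahrenheit column for the lows
```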
So far we have seen some of the ways to generate data within R. But of course, importing data is an integral part of R's functionality. We will look at some simple examples first. R provides functions for reading delimited flat text files, i.e. .csv (comma separated) and .txt (tab separated).
We will import a file named census.csv. The .csv extension indicates that this is a comma-separated file, so we will employ the R function for reading such files, read.csv. Notice the use of the assignment operator <- to take the information returned by read.csv and store it inside an object called census. The object name is completely arbitrary: you could use any name you wish, so long as it is not the same as one of the R functions and it does not begin with a number.
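The import command itself isn't shown in the original; a minimal sketch, assuming census.csv sits in your working directory:

```r
census <- read.csv("census.csv")  # read the comma-separated file into a data frame
```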
A quick method for checking what the data in an object looks like is to run the head()
function on it. For example,
head(census)
GEO_ID Location Estimate MarginError
1 0400000US27 Minnesota 5527358 *****
2 0400000US28 Mississippi 2988762 *****
3 0400000US29 Missouri 6090062 *****
4 0400000US30 Montana 1041732 *****
5 0400000US31 Nebraska 1904760 *****
6 0400000US32 Nevada 2922849 *****
By default, this function displays the first six rows of the data set. Similarly, you can check the last six rows with the tail() function.
tail(census)
GEO_ID Location Estimate MarginError
48 0400000US22 Louisiana 4663616 *****
49 0400000US23 Maine 1332813 *****
50 0400000US24 Maryland 6003435 *****
51 0400000US25 Massachusetts 6830193 *****
52 0400000US26 Michigan 9957488 *****
53 0100000US United States 322903030 *****
Notice how R always formats the data nicely so that the columns are perfectly aligned. It also always shows the header line, even when you are looking at only the bottom of the file.
You might want to check the dimensions of this data frame (i.e. number of columns and rows).
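The dim() function reports this. The command isn't shown in the original, but given the 53 rows and four columns described here, it would look like:

```r
dim(census)
# [1] 53  4
```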
The first number is for rows and second for columns. There are 53 rows because in addition to the 50 states, we have data on District of Columbia, Puerto Rico and the entire United States.
If you want to check data in rows and columns between those shown by the head() and tail() functions, R provides multiple ways to do so, including a coordinate system of the form:
obj[ROW, COL]
For example, to extract row 20 and columns 1:4, you would type this:
census[20,1:4]
In fact, when asking for all columns in a given row, the column call is superfluous. Instead try this:
census[20,]
which will provide the exact same information as above.
census[15:25,]
GEO_ID Location Estimate MarginError
15 0400000US41 Oregon 4081943 *****
16 0400000US42 Pennsylvania 12791181 *****
17 0400000US44 Rhode Island 1056611 *****
18 0400000US45 South Carolina 4955925 *****
19 0400000US46 South Dakota 864289 *****
20 0400000US47 Tennessee 6651089 *****
21 0400000US48 Texas 27885195 *****
22 0400000US49 Utah 3045350 *****
23 0400000US50 Vermont 624977 *****
24 0400000US51 Virginia 8413774 *****
25 0400000US54 West Virginia 1829054 *****
There is a single column of numeric data in this data frame. What are some of the questions you can ask of this data? Here are a few I came up with:
We can easily answer these questions using functions in R. However, our data set currently has a row containing combined numbers for all states and territories. We can either remove that row entirely, or better yet, make a copy of this data frame without the last row. Here is how to do it:
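The command isn't shown in the original; a sketch using negative indexing, which drops row 53 (the United States total):

```r
census2 <- census[-53, ]  # a copy of census without the last row
```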
census2$Location
[1] Minnesota Mississippi Missouri
[4] Montana Nebraska Nevada
[7] New Hampshire New Jersey New Mexico
[10] New York North Carolina North Dakota
[13] Ohio Oklahoma Oregon
[16] Pennsylvania Rhode Island South Carolina
[19] South Dakota Tennessee Texas
[22] Utah Vermont Virginia
[25] West Virginia Washington Wisconsin
[28] Wyoming Puerto Rico Alabama
[31] Alaska Arizona Arkansas
[34] California Colorado Connecticut
[37] Delaware District of Columbia Florida
[40] Georgia Idaho Hawaii
[43] Illinois Indiana Iowa
[46] Kansas Kentucky Louisiana
[49] Maine Maryland Massachusetts
[52] Michigan
subset(census2, Estimate == 581836)
GEO_ID Location Estimate MarginError
28 0400000US56 Wyoming 581836 *****
There are a few housekeeping tasks you will need to do to maintain your R installation on a regular basis. This involves keeping your base-R version, your R-Studio installation, as well as various additional packages updated. Installing a new version of R often breaks existing packages, which then have to be updated as well. Updating the R installation is as simple as going to r-project.org and downloading/installing the latest version. Unfortunately, given the administrative permissions on the workstations you are working on right now, we can't really try this out. However, we will tackle the subject of packages next.
Some of you might be wondering what an R package is. It's a fair question that baffles many beginner R users. Remember that we talked earlier about the base-R distribution, which includes a core set of functions to work with data; we visited some of these functions above. But R is now used in a large number of disparate scientific fields scattered across industry, academia, government and non-government entities. This creates a need for highly specialized functions to do specific tasks that are not included in your base-R distribution.
An R package is simply a collection of functions that perform specialized tasks. Data scientists and enthusiasts alike volunteer their valuable time to develop such specialized packages and then make them available to the larger user base free of charge.
We can think of an analogy here. Your smartphone operating system comes with a suite of basic functions that allow you to make and receive phone calls and text messages. However, if you need to do a video chat, you will need a special app for that; to check your email, you will need another app. An app here is highly similar to a package capable of doing specific tasks.
There are literally thousands of R packages available. To give you a few examples:

- ggplot2 is used extensively in data visualization
- vegan is a package that contains a large number of statistical tests
- RMarkdown allows generation of dynamic documents, such as the one you are currently reading
- parallel splits data analysis jobs into chunks so they can be completed faster by multiple processors

And so on and so forth.
You can download and install packages from several sources, though of course not all packages are available from all sources. These sources are also called repositories. There are three main repositories that you should be aware of:
CRAN: The Comprehensive R Archive Network located at CRAN.r-project.org: This is by far the biggest and most vetted repository of packages that are available. Before a package appears in the CRAN repository, it must be submitted for review and thoroughly vetted and approved by the folks responsible for CRAN. As of today, there are 15,364 packages available there.
Bioconductor: the repository to go to if you are looking for biological analysis software, e.g. bioinformatics packages. This repository is located at bioconductor.org and currently contains 1,823 packages.
GitHub: has become the de facto place to share cutting-edge development versions of many of the packages available in the above two repositories. Often, when package installation fails due to incompatibility between your R version and a pre-compiled package, you can find the package's source code on GitHub and circumvent the compatibility problem. It's too difficult to count the number of package sources available on GitHub, but it is easy to find what you need when you need it.
There are several ways of installing R packages. The first is from CRAN, using the built-in function install.packages(). For example, let's try installing one (see the sketch after the note on quoting below). When you launch this command for the first time, a dialogue box will pop up on your screen asking you to choose a mirror for installation. What is a mirror, you might ask? Because R packages are installed from all over the world, repositories make their packages available on a number of web servers (each called a mirror) around the world. The idea is that by choosing the mirror closest to you, the download speeds will be the highest you can get.
If the above command was successful, the package was installed. However, that doesn't automatically make the package's functions available for your use. You must first load the package into R's memory.
Note that you use quotes around the package name when installing it but not when loading it. If you deviate from this convention, R will generate an error message.
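The original commands aren't shown; a minimal sketch, using ggplot2 (mentioned earlier) as the example package:

```r
install.packages("ggplot2")  # install from CRAN; note the quotes
library(ggplot2)             # load the package into memory; no quotes needed
```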
Now let's try a package from Bioconductor, called DESeq2. If you go to the package webpage, you will find an installation section. It consists of two separate commands: the first installs a package called BiocManager on your computer, which is needed for the installation of every Bioconductor package; the second installs DESeq2 itself.
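A sketch of those two commands as they appear on Bioconductor package pages (version-pinning arguments omitted):

```r
install.packages("BiocManager")  # step 1: the Bioconductor installer, from CRAN
BiocManager::install("DESeq2")   # step 2: install DESeq2 itself
```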
There are two circumstances under which you will need to access package source code on GitHub. One scenario is that your R version is incompatible with the package you are trying to install from CRAN. Alternatively, you may find that the package version on CRAN is outdated and you want access to new functionality included only in the source code available on GitHub. In either of these cases, here is how to install an R package from source:
First, you will need a package called devtools, so install devtools first. Assuming this worked, let's install a package called rmarkdown from GitHub. In the address rstudio/rmarkdown, the rstudio part is the name of the author/account on GitHub and rmarkdown is the name of the package.
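A sketch of the two steps; install_github() is the devtools function for installing from GitHub:

```r
install.packages("devtools")                   # devtools provides install_github()
devtools::install_github("rstudio/rmarkdown")  # account name / package name
```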
Exercise: Install the tidyverse family of packages from GitHub. Their address is: github.com/hadley/tidyverse.
R-Studio provides a nice interface for accessing help documentation within R, but you can also search for help through the console window. Invoking the help page for any function is done by simply typing the name of the function preceded by a question mark. Try the following out:
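For example, using read.csv, which we met earlier (any function name works):

```r
?read.csv       # opens the help page for read.csv
help(read.csv)  # the equivalent long form
```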
R has become extremely popular as a powerful statistical analysis program in all sectors, e.g. academia, industry, government and NGOs. Consequently, there are a number of websites, forums and mailing lists available to you. When you run into a problem with your code or any R function, chances are that someone has already asked a question about it which has been answered. Below is a list of resources that you might find helpful.
R provides numerous functions for performing statistical analysis on a wide variety of data sets. In this section, we will both import example data and also create some on the fly within R. Here we will look at some of the most commonly used summary statistics.
We will start with the built-in mtcars data set. First, look at its help page:
?mtcars
mtcars package:datasets R Documentation
Motor Trend Car Road Tests
Description:
The data was extracted from the 1974 _Motor Trend_ US magazine,
and comprises fuel consumption and 10 aspects of automobile design
and performance for 32 automobiles (1973-74 models).
Usage:
mtcars
Format:
A data frame with 32 observations on 11 (numeric) variables.
[, 1] mpg Miles/(US) gallon
[, 2] cyl Number of cylinders
[, 3] disp Displacement (cu.in.)
[, 4] hp Gross horsepower
[, 5] drat Rear axle ratio
[, 6] wt Weight (1000 lbs)
[, 7] qsec 1/4 mile time
[, 8] vs Engine (0 = V-shaped, 1 = straight)
[, 9] am Transmission (0 = automatic, 1 = manual)
[,10] gear Number of forward gears
[,11] carb Number of carburetors
min(mtcars$mpg)
[1] 10.4
subset(mtcars, mpg == 10.4)
mpg cyl disp hp drat wt qsec vs am gear carb
Cadillac Fleetwood 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
Lincoln Continental 10.4 8 460 215 3.00 5.424 17.82 0 0 3 4
subset(mtcars, mpg == 33.9)
mpg cyl disp hp drat wt qsec vs am gear carb
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
subset(mtcars, mpg < 21 & mpg > 19)
mpg cyl disp hp drat wt qsec vs am gear carb
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
summary(mtcars)
mpg cyl disp hp
Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
Median :19.20 Median :6.000 Median :196.3 Median :123.0
Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
drat wt qsec vs
Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
Median :3.695 Median :3.325 Median :17.71 Median :0.0000
Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
am gear carb
Min. :0.0000 Min. :3.000 Min. :1.000
1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
Median :0.0000 Median :4.000 Median :2.000
Mean :0.4062 Mean :3.688 Mean :2.812
3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
Max. :1.0000 Max. :5.000 Max. :8.000
Load the data on the approval ratings of U.S. presidents from 1945 through 1974; the dataset is named presidents.
Examine the data structure (hint: head, tail etc.).
Analyze a summary of each variable in the data set.
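A possible solution sketch (presidents is a quarterly time series, so summary() reports a single set of statistics plus a count of missing values):

```r
data(presidents)     # load the built-in data set
head(presidents)     # first few observations
tail(presidents)     # last few observations
summary(presidents)  # min, quartiles, mean, max, NA count
```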
rnorm(10, 0, 1)
[1] -0.07464322 -1.51165519 0.24215035 -1.01457525 0.12778428 -0.39497391
[7] -1.98659094 -0.01535476 -1.90261294 0.76944419
Here we are using the function rnorm(), which takes three arguments: rnorm(n, mean, sd), where n is the number of values to draw (the length of the output vector). In the above example, we asked for n=10, mean=0 and sd=1.
Let's create four additional vectors of varying lengths, all with a mean of 5 and a standard deviation of 2.
norm10 <- rnorm(10, 5, 2)
norm100 <- rnorm(100, 5, 2)
norm1k <- rnorm(1000, 5, 2)
norm100k <- rnorm(100000, 5, 2)
library(ggplot2)  # the ggplot() function comes from the ggplot2 package
ggplot() + geom_histogram(aes(norm100))
ggplot() + geom_histogram(aes(norm1k))
ggplot() + geom_histogram(aes(norm100k))
The R function that deals with the uniform distribution is named runif(). Look at the help page for it, then try the exercise below.
Simulate two vectors, one of length 100 and another of 1 million and store them in different objects.
Draw a histogram of each vector to examine the shape of the distribution and identify any differences.
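A possible solution sketch (object names are arbitrary; hist() is the base-R way to draw histograms):

```r
unif100 <- runif(100)      # 100 draws from Uniform(0, 1)
unif1m  <- runif(1000000)  # one million draws
hist(unif100)              # small sample: a ragged histogram
hist(unif1m)               # large sample: nearly flat, as expected
```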
It is possible to randomly sample data from a distribution. The sample() function takes the following arguments:
sample(vector, n, replace=VALUE)
where vector is the data to sample from and n is the size of your sample. replace can be set to either TRUE or FALSE, depending upon whether your experiment allows re-sampling of the same value across successive iterations. However, it must be set to TRUE if your sample size exceeds the size of the vector. In other words, if you are sampling a vector of length 10 fifteen times, you will run out of numbers if replace is set to FALSE. Try the following example:
sample(1:10, 15, replace=FALSE)
Error in sample.int(length(x), size, replace, prob) :
cannot take a sample larger than the population when 'replace = FALSE'
sample(1:10, 15, replace=TRUE)
[1] 7 3 10 4 5 3 5 7 10 3 10 1 2 4 2
set.seed(5)
sample(500:1000, 50, replace=TRUE)
[1] 821 862 684 706 702 876 796 712 721 570 902 886 793 813 930 808 831 515 909
[20] 984 627 526 956 969 719 829 584 621 973 806 612 937 683 591 768 962 906 734
[39] 553 929 645 963 691 652 893 797 615 747 507 967
Seed can be set to any number you wish. Using the same seed again will reproduce your original sample. Try running the command again with the same seed to verify.
Now draw the sample again without setting the seed and you will notice that it changes again.
sample(500:1000, 50, replace=TRUE)
[1] 719 903 864 990 603 735 944 622 962 825 648 590 854 571 550 933 579 919 507
[20] 980 965 831 815 880 589 626 724 514 873 599 708 647 624 741 638 636 503 502
[39] 911 654 726 503 526 739 668 921 629 771 904 687
The set.seed() function is extremely useful for achieving reproducibility. You may use it with any function in R that involves random number generation, not just sample().
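For example, a quick sketch of the same idea with rnorm():

```r
set.seed(42)
rnorm(3)  # some sequence of three random numbers
set.seed(42)
rnorm(3)  # the exact same three numbers again
```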
The sample() function works with both numerical and character data. Let's consider an example from genetics. Imagine you wish to create a population of 200 individuals to study a gene A. This gene appears in the population in two forms: A and a. Because all the individuals carry two copies of each gene (such organisms are called diploid), there are three distinct possibilities for the genetic makeup of each individual: AA, Aa or aa.
pop <- sample(c("AA", "Aa", "aa"), 200, replace=TRUE)
pop
[1] "AA" "AA" "AA" "Aa" "aa" "aa" "AA" "Aa" "AA" "AA" "Aa" "aa" "Aa" "AA" "aa"
[16] "Aa" "AA" "aa" "aa" "Aa" "Aa" "Aa" "Aa" "aa" "aa" "aa" "AA" "aa" "AA" "aa"
[31] "Aa" "aa" "aa" "AA" "AA" "Aa" "Aa" "AA" "Aa" "AA" "AA" "Aa" "aa" "AA" "AA"
[46] "aa" "Aa" "aa" "AA" "Aa" "aa" "aa" "aa" "Aa" "AA" "Aa" "aa" "AA" "AA" "Aa"
[61] "aa" "aa" "Aa" "AA" "aa" "Aa" "AA" "aa" "aa" "AA" "aa" "AA" "Aa" "AA" "AA"
[76] "AA" "aa" "Aa" "aa" "aa" "aa" "AA" "AA" "aa" "aa" "AA" "AA" "Aa" "AA" "Aa"
[91] "aa" "AA" "aa" "aa" "aa" "Aa" "aa" "aa" "aa" "Aa" "AA" "AA" "aa" "aa" "aa"
[106] "Aa" "AA" "AA" "Aa" "Aa" "aa" "Aa" "Aa" "aa" "AA" "aa" "aa" "aa" "AA" "aa"
[121] "AA" "Aa" "AA" "AA" "AA" "Aa" "aa" "AA" "AA" "aa" "aa" "Aa" "AA" "aa" "Aa"
[136] "Aa" "AA" "aa" "Aa" "aa" "aa" "AA" "AA" "Aa" "aa" "Aa" "AA" "Aa" "AA" "AA"
[151] "aa" "AA" "AA" "AA" "Aa" "aa" "AA" "AA" "Aa" "Aa" "aa" "aa" "Aa" "Aa" "Aa"
[166] "AA" "AA" "aa" "aa" "aa" "Aa" "Aa" "AA" "Aa" "Aa" "aa" "aa" "AA" "AA" "AA"
[181] "aa" "AA" "Aa" "AA" "Aa" "Aa" "Aa" "AA" "Aa" "Aa" "aa" "aa" "aa" "aa" "Aa"
[196] "Aa" "AA" "AA" "AA" "aa"
Now let's draw a smaller sample of 50 individuals from this population:
pop1 <- sample(pop, 50, replace=TRUE)
pop1
[1] "aa" "AA" "AA" "AA" "AA" "Aa" "AA" "aa" "AA" "Aa" "aa" "AA" "aa" "Aa" "Aa"
[16] "Aa" "aa" "aa" "Aa" "Aa" "AA" "aa" "aa" "aa" "aa" "aa" "aa" "AA" "aa" "aa"
[31] "Aa" "Aa" "aa" "AA" "AA" "Aa" "Aa" "Aa" "aa" "aa" "aa" "aa" "Aa" "aa" "aa"
[46] "Aa" "aa" "aa" "AA" "AA"
We can count how many times each genotype appears in the sample:
sum(pop1 == "AA")
[1] 13
sum(pop1 == "Aa")
[1] 14
sum(pop1 == "aa")
[1] 23
Next, let's look at the built-in airquality data set, which contains air quality measurements from New York City.
data(airquality)
dim(airquality)
[1] 153 6
head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
Exercise: From a character vector containing HEADS and TAILS, randomly take a sample of length 1000, then count how many HEADS were sampled.
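A possible solution sketch:

```r
coin  <- c("HEADS", "TAILS")               # the two possible outcomes
flips <- sample(coin, 1000, replace=TRUE)  # 1000 tosses; replace must be TRUE
sum(flips == "HEADS")                      # count the heads
```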
When analyzing any type of data, you may encounter the need for repetition, where a given task needs to be repeated a number of times. If you need to repeat it just a handful of times, you can afford to run it manually. But if even ten repetitions are needed, you will save a lot of time by automating the task through a loop. A very popular type of loop is the for() loop. Some programmers will argue that loops are slow, but this really isn't a concern when repeating smaller tasks. Let's look at some examples:
for (i in 1:20){
print(i)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
for() loops are also commonly used when generating a number of similar plots from the same data set. Consider a data set where exam scores are recorded for 100 students. Each student has to answer four questions, and their grade for each question ranges between 0 and 10. First, let's create this data within R using the sample() function: generate 100 integers between 0 and 10 with replacement.
sample(0:10, 100, replace=TRUE)
[1] 10 6 3 2 5 3 10 10 1 2 10 9 8 10 7 3 8 1 4 9 10 4 3 3 6
[26] 10 8 2 9 7 1 10 0 4 0 3 10 5 6 2 8 8 5 4 8 9 5 10 5 7
[51] 6 5 9 9 0 10 10 9 7 5 0 0 6 4 5 9 2 10 9 1 5 2 2 7 10
[76] 7 6 4 3 7 6 5 2 10 10 4 4 9 0 2 8 1 1 5 0 3 3 8 2 5
scores <- replicate(4, sample(0:10, 100, replace=TRUE))  # repeat the sampling 4 times: a 100 x 4 matrix
scores
[,1] [,2] [,3] [,4]
[1,] 8 4 1 0
[2,] 7 2 8 10
[3,] 1 5 5 1
[4,] 10 7 7 1
[5,] 8 1 0 0
[6,] 0 2 4 5
[7,] 6 9 10 9
[8,] 9 4 9 6
[9,] 2 0 10 5
[10,] 9 0 5 9
[11,] 5 7 2 9
[12,] 9 6 7 2
[13,] 8 9 7 3
[14,] 3 10 8 0
[15,] 0 3 4 10
[16,] 9 7 4 8
[17,] 7 9 1 3
[18,] 3 6 10 1
[19,] 3 0 7 5
[20,] 5 5 10 10
[21,] 3 10 8 1
[22,] 6 0 10 5
[23,] 4 1 4 8
[24,] 7 1 4 10
[25,] 5 8 1 6
[26,] 10 3 2 9
[27,] 7 8 0 8
[28,] 2 4 2 5
[29,] 4 9 7 6
[30,] 4 5 9 0
[31,] 3 10 8 2
[32,] 1 2 8 2
[33,] 7 10 5 8
[34,] 5 5 0 7
[35,] 8 6 10 9
[36,] 2 0 4 6
[37,] 1 6 5 7
[38,] 10 5 10 4
[39,] 7 3 4 5
[40,] 9 6 8 10
[41,] 1 3 1 8
[42,] 8 4 5 8
[43,] 1 5 8 7
[44,] 3 3 0 10
[45,] 1 4 8 1
[46,] 4 6 4 5
[47,] 7 10 8 5
[48,] 9 3 1 0
[49,] 3 0 5 2
[50,] 1 9 9 6
[51,] 6 5 4 1
[52,] 4 7 0 9
[53,] 1 9 10 0
[54,] 5 3 6 6
[55,] 9 2 7 8
[56,] 7 10 7 5
[57,] 5 10 7 10
[58,] 10 5 9 6
[59,] 6 4 1 4
[60,] 2 2 6 10
[61,] 3 8 3 5
[62,] 2 1 4 4
[63,] 10 1 8 5
[64,] 10 0 3 2
[65,] 6 5 0 1
[66,] 9 4 7 1
[67,] 6 9 5 5
[68,] 8 8 9 8
[69,] 4 8 1 0
[70,] 4 5 3 0
[71,] 5 8 0 5
[72,] 3 3 0 10
[73,] 0 10 7 3
[74,] 1 5 0 5
[75,] 6 6 5 9
[76,] 2 3 5 7
[77,] 8 7 9 3
[78,] 9 5 2 10
[79,] 0 7 3 6
[80,] 0 0 10 3
[81,] 7 3 1 10
[82,] 9 4 10 6
[83,] 1 9 7 5
[84,] 2 8 5 1
[85,] 3 5 10 8
[86,] 4 5 0 10
[87,] 4 9 8 8
[88,] 2 1 3 9
[89,] 3 0 2 8
[90,] 6 8 6 5
[91,] 4 6 9 0
[92,] 2 7 2 2
[93,] 2 0 4 10
[94,] 2 7 7 8
[95,] 8 0 9 0
[96,] 0 1 10 7
[97,] 10 7 6 2
[98,] 1 4 3 7
[99,] 0 2 6 7
[100,] 9 0 2 8
That’s 100 rows (i.e. 100 students) and 4 scores.
Now you want to make histograms for each of these four scores to get a visual of how the class performed on the exam overall.
Before making the loop, first make one histogram:
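The single-histogram command isn't shown in the original; a sketch for the first question's scores:

```r
hist(scores[, 1],         # scores for question 1 (the first column)
     main = "Question 1",
     xlab = "Scores",
     xlim = c(0, 10))
```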
par(mfrow=c(2,2))                      # arrange the four plots in a 2x2 grid
for (i in 1:4){
  x <- scores[,i]                      # scores for question i (column i)
  hist(x,
       main = paste("Question ", i),   # paste() builds a unique title for each plot
       xlab = "Scores",
       xlim = c(0,10))
}
Exercise: Load the U.S. states data (data(state)) and check the data frame called state.x77. Modify the loop above to plot two histograms from this data frame, one each for Population and Area.
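A possible solution sketch (state.x77 is a matrix, so columns are selected by name):

```r
data(state)                  # load the U.S. states data sets
par(mfrow=c(1,2))            # two plots side by side
for (v in c("Population", "Area")){
  hist(state.x77[, v],       # select the column by name
       main = v,
       xlab = v)
}
```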
The Student's t-test, also called just the t-test, is used to compare the mean values of two samples that are assumed to be normally distributed; it is a parametric test. The goal is to test whether the means of two samples are significantly different from each other.
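The commands that created x and y aren't shown in the original; given the reported means near zero and degrees of freedom close to 2000, they were presumably drawn from a standard normal distribution, e.g.:

```r
x <- rnorm(1000)  # 1000 draws with mean 0 and sd 1 (the defaults)
y <- rnorm(1000)  # a second, independent sample
```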
t.test(x, y)
Welch Two Sample t-test
data: x and y
t = 1.0993, df = 1998, p-value = 0.2718
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.03860205 0.13706775
sample estimates:
mean of x mean of y
-0.02702567 -0.07625852
As the output shows, the means of x and y are not significantly different from each other (p-value 0.2718). Now let's compare two samples drawn with clearly different means:
a <- rnorm(1000, mean=2)
b <- rnorm(1000, mean=4)
t.test(a, b)
Welch Two Sample t-test
data: a and b
t = -44.009, df = 1997.8, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-2.073730 -1.896795
sample estimates:
mean of x mean of y
2.023478 4.008740
Exercise: Generate two normally distributed vectors, both with a mean of 0.8. Compare them with the t-test and observe whether they have similar or different means.
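A possible solution sketch (the vector length is arbitrary):

```r
v1 <- rnorm(1000, mean=0.8)
v2 <- rnorm(1000, mean=0.8)
t.test(v1, v2)  # expect a non-significant p-value: the true means are equal
```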
The chi-square test checks how well observed values fit expectations based on theoretical principles; in other words, it is a goodness-of-fit test. Let's do an example from population genetics. There is a fundamental theorem in population genetics called the Hardy-Weinberg Equilibrium (HWE for short), which provides the expected proportions of genotypes in a population if it conforms to certain assumptions. We do not need to get into these assumptions here.
Consider a population of 5000 diploid individuals in which we are studying a gene named F, which has two morphs/alleles/forms: F and f. Because the organism is diploid, each individual has two copies of this gene (by way of having two chromosomes of each type). Therefore we have three possible genotypes at this gene: FF, Ff and ff. If the allele frequencies are p (for F) and q (for f), HWE posits that the genotypes will occur at frequencies p^2, 2pq and q^2, which add up to unity, provided the population conforms to certain assumptions.
We can use the chi-square test to check whether our sampled population is in HWE for gene F.
First, let's sample a population:
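The sampling command isn't shown in the original; a sketch consistent with the calculations that follow (5000 individuals, hence 10000 gene copies):

```r
pop <- sample(c("FF", "Ff", "ff"), 5000, replace=TRUE)  # 5000 random diploid genotypes
```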
p <- (sum(pop == "FF")*2 + sum(pop == "Ff"))/10000  # frequency of allele F
q <- (sum(pop == "ff")*2 + sum(pop == "Ff"))/10000  # frequency of allele f
We can then estimate the expected genotype counts from these allele frequencies.
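A sketch of that computation; multiplying the HWE frequencies by the population size gives the expected counts used in the table below:

```r
exp_FF <- p^2   * 5000  # expected count of FF homozygotes
exp_Ff <- 2*p*q * 5000  # expected count of Ff heterozygotes
exp_ff <- q^2   * 5000  # expected count of ff homozygotes
```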
In contrast, our observed numbers for the same genotypes (FF, Ff, ff) are 1726, 1626 and 1648.
Now we can set up the chi-square test:

Geno | Obs | Exp | (O-E)^2/E |
---|---|---|---|
FF | 1726 | 1289.304 | 147.9119 |
Ff | 1626 | 2499.392 | 305.1997 |
ff | 1648 | 1211.304 | 157.6557 |
Total | 5000 | 5000 | 610.7673 |
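A sketch of computing the test statistic and its p-value in R (one degree of freedom here: three genotype classes, minus one, minus one estimated allele frequency):

```r
obs      <- c(1726, 1626, 1648)
expected <- c(1289.304, 2499.392, 1211.304)
chisq <- sum((obs - expected)^2 / expected)  # 610.7673, matching the table
pchisq(chisq, df=1, lower.tail=FALSE)        # p-value; vanishingly small here
```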
Testing for correlation between two variables can be done through multiple functions in R, but not all of them provide significance testing. We will use the function cor.test(), which provides significance values for the correlation tests and also allows us to test with different methods, e.g. Spearman vs. Pearson. We will use the faithful data set built into R, which has eruption durations and wait times between eruptions for the Old Faithful geyser in Yellowstone National Park.
data(faithful)
attach(faithful)
cor.test(eruptions, waiting)
Pearson's product-moment correlation
data: eruptions and waiting
t = 34.089, df = 270, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8756964 0.9210652
sample estimates:
cor
0.9008112
cor.test(eruptions, waiting, method="spearman")  # the method name must be lowercase
Spearman's rank correlation rho
data: eruptions and waiting
S = 744659, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.7779721
Thus, there is a highly significant correlation between the wait time and eruption duration at this geyser. What does this relationship look like in a plot?
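The plotting command isn't shown in the original; a base-R sketch:

```r
plot(eruptions, waiting,
     xlab = "Eruption duration (min)",
     ylab = "Waiting time to next eruption (min)")  # a clearly positive, roughly linear trend
```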
Exercise: Load the state data set you used earlier. Estimate the correlation between high school graduation rate and income using both the Spearman and Pearson methods, then make a plot of this relationship and draw a regression line.
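A possible solution sketch (the relevant state.x77 columns are named "HS Grad" and "Income"):

```r
data(state)
hsgrad <- state.x77[, "HS Grad"]  # percent high-school graduates
income <- state.x77[, "Income"]   # per capita income
cor.test(hsgrad, income, method="pearson")
cor.test(hsgrad, income, method="spearman")
plot(hsgrad, income, xlab="HS graduates (%)", ylab="Per capita income")
abline(lm(income ~ hsgrad))       # add the least-squares regression line
```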