#### Introduction to R ####
# Maybe interesting:
# https://www.reddit.com/r/rstats/comments/1ak05u7/what_are_some_cool_r_packages_to_use_in_2024/
# Get your packages - functional extensions to base R:--------
if (!require(pacman)) {
install.packages("pacman")
library(pacman) # Install and load package in one
}
pacman::p_load(tidyverse, # https://tidyverse.tidyverse.org/
readxl,
writexl, # read write Excel files
DataExplorer, # Data overview
gtsummary, # Creates presentation-ready tables
table1, # nice table 1
flextable, # nicer tables
nycflights13, gapminder, Lahman, # data sets
devtools, # Collection of package development tools.
lubridate, # date functions
Hmisc, # Harrell Miscellaneous
data.table, # for fast operations on large data sets
psych, # A general purpose toolbox developed originally for personality, psychometric theory and experimental psychology.
visdat, # visualize missing values
tictoc, # timing
hexbin, # bivariate binning into hexagon cells
broman, # miscellaneous functions by Karl Broman
officer) # The officer package facilitates access to and manipulation of 'Microsoft Word' and 'Microsoft PowerPoint'
# Cite packages when you use them
citation("readxl")
sessionInfo() # to see everything about what is used in my current session.
today() # returns the current date, e.g. "2024-11-12"
# File-Info----
# The following code snippets and examples are by no means exhaustive and are
# intended to give a brief impression of R's functionality and explain the most
# important functions/methods.
# The main message of this section is to show possibilities and ways of thinking
# for data manipulation.
# A specific problem is usually solved in conjunction with online sources.
# ChatGPT helps IMMENSELY in creating R code!
# Use Github Copilot for auto completion of code snippets!
# Many of the following explanations are based on
# https://r4ds.had.co.nz/introduction.html!
# https://r4ds.hadley.nz/ is the second edition.
# Thank you, Hadley!
# Many more free R introductions can be found online, see e.g., also YouTube.
# Also see the package learnr (top right under "Tutorial").
# Also see https://en.wikipedia.org/wiki/R_(programming_language) for an
# overview of the language itself.
# First things first:
# Why R (or Stata / SPSS / SAS....) and not just Excel? (see e.g., https://www.northeastern.edu/graduate/blog/r-vs-excel/)
# - ease-of-use: learning curve; point-and-click vs. programming
# - replicability
# - interaction with data, subsetting, transforming, visualizing ....
# - large datasets? (Since Excel 2007, a worksheet can contain 1,048,576 rows and 16,384 columns (A to XFD),
#   comprising 17,179,869,184 cells. Before that, the size was limited to 65,536 rows and 256 columns (A to IV), comprising 16,777,216 cells.)
#   Data sets with roughly one million to one billion records can also be processed in R, but this needs some
#   additional effort (see the data.table sketch after this list).
# - Price: R is free,
# - Speaking from experience: licenses are a pain!!
# - Expandability in Excel? -> packages in R!
# - ....
# Btw: For Excel, there is the R plug-in RExcel.
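# A minimal sketch for reading a large CSV quickly with data.table
# ("big_file.csv" is just a placeholder for your own file):
# big <- data.table::fread("big_file.csv") # typically much faster than read.csv()
# dim(big); head(big)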
# 0) Before you start #####
# _RStudio layout----
# _How to setup your working environment-----
# - Relative paths (and trick below) or
# - RProjects (File -> New Project -> New Directory -> New Project)
# Trick: Set working directory to source file location:
setwd( dirname(rstudioapi::getSourceEditorContext()$path) )
getwd() # get working directory
# equivalent with:
# Menu: Session -> Set Working Directory -> To Source File Location
# _Execute a command from the script:----
# str/command + ENTER; or
# "Run" button (upper right)
# _What R-version do I have?---------
R.Version()
# Check if R-version is up-to-date automatically:
# If you have devtools installed:
devtools::source_url("https://github.com/jdegenfellner/ZHAW_Teaching/raw/main/Check_R_version_if_up_to_date.R")
# _R clear console: Command + L----
# _Clear (almost) entire RStudio:------
devtools::source_url("https://raw.githubusercontent.com/jdegenfellner/ZHAW_Teaching/main/Cleanup_RStudio.R")
# 1) Very basics ----
#
# c() concatenate
c(1,2)
x <- c(1,2,3,4,5) # create vector .... "<-" assignment operator
x
x = c(1,2,3,4,5)
class(x) # "numeric"
d <- c("Yes","No") # create string vector
class(d) # "character"
x
( x <- c(1,2,3,4,5) ) # show result immediately
2 -> y # other direction also possible
y
y = 12 # also possible
12 = y # error
rm(y) # remove y
y # object not found
# We start very simple:
# how to execute a primitive command, use R as calculator
1 + 2
sin(pi / 2)
log(12) # "ln"; natural log, base e
# important!!
?log # make use of the documentation by ? and function-name
help(log)
log( c(2,3) ) # function is applied to all elements in vector
log(2, base = 3) # base 3
log(2,3,12) # error
# in general:
# function( ... arguments separated by "," .... )
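# A quick illustration of positional vs. named arguments (base R only):
round(3.14159, digits = 2) # named argument
round(3.14159, 2)          # positional, same result
seq(from = 0, to = 1, by = 0.25) # naming the arguments keeps longer calls readable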
# !!!
# Getting help and learning more: "give a man a fish and you feed him for a day;
# teach a man to fish and you feed him for a lifetime"
# -> Dr.Google :) <-
# and currently:
# GPT4o
# Naming conventions for variables?
# case sensitive
large_Variable <- 3
Large_Variable # not found
# sequences
seq(from = 1, to = 100, by = 2) # increase by 2 -> R creates a sequence
y <- seq(1, 10, length.out = 5) # 5 equidistant points between 1 and 10
y
str(x) # structure, important!
flights
str(flights)
flights$year # go into column and show vector
flights$year[1] # first element
flights$year[1:10] # first 10 elements
class(flights)
View(flights) # opens new tab with non-editable overview
class(x)
class(flights)
x <- "2"
x*2 # error; Error in x * 2 : non-numeric argument to binary operator
log(2)
# simple statistics in base R:
mean(flights$arr_time) # NA (Not Available) - there are missing values
is.na(flights$arr_time) # logical vector
sum(is.na(flights$arr_time)) # how many NAs?
mean(flights$arr_time, na.rm = TRUE) # remove NAs; = 1502.055
mean(y) # mean {base}
median(y)
median(c(2,3,4,12)) # 3.5
median(c(2,3,4,120)) # 3.5 -> robust against outlier
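# For contrast: the mean is pulled towards the outlier
mean(c(2,3,4,12))  # 5.25
mean(c(2,3,4,120)) # 32.25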
median( c(x,y) ) # error, different type
median(c(2,3,4,12), c(45,2,3,1)) # What does this do?
# UP TO HERE DAY 1----
x <- rnorm(1000) # sample of 1000 random numbers from X ~ N(mean = 0, sd = 1)
hist(x) # histogram in base R (later with ggplot2) -> see also: https://github.com/jdegenfellner/ZHAW_Teaching/blob/main/Density_plot_boxplot_below.R
boxplot(x) # base R boxplot
mean(x) # mean is an estimator for the population mean ~ 0
median(x)
sum(x <= -0.06026) # how many values are smaller than -0.06026?
sd(x) # standard deviation ~ 1
var(x) # variance; the estimator is unbiased (correct on average)
1/(length(x)-1)*sum( (x-mean(x))^2 ) # same
summary(x) # univariate summary: Min, 1st Quartile, Median, Mean, 3rd Quartile, Max
# Create functions:
polynom <- function(x, y = 4){ # y has default value of 4
x^2 + 3*x + 5*x*y + y^3 # last value is returned
}
polynom(1) # 1^2 + 3*1 + 5*1*4 + 4^3
1^2 + 3*1 + 5*1*4 + 4^3
polynom(x = 1,y = 0) # y = 0
polynom_useless <- function(x, y = 4){ # y has default value of 4
x^2 + 3*x + 5*x*y + y^3 # last value is returned
return(23) # please return the value 23 and ignore what came before
}
polynom_useless(x = 2, y = 4) # 23
# loops (that I need the most)
for(i in 1:5){
print(i)
}
for( i in c(1,-3,5,7) ){
print(i^3)
} # 1^3 (-3)^3 5^3 7^3
# sometimes a while loop is useful
i <- 1
while(i < 10){
print(i)
i <- i + 1
}
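# Explicit loops can often be replaced by vectorised code or an apply-type call:
sapply(c(1,-3,5,7), function(i) i^3) # same result as the for loop above
c(1,-3,5,7)^3                        # or simply vectorised arithmetic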
# 1.1) Tips and Tricks in R--------
# _document outline----
# _comment out whole sections:----
# step 1: mark the section you want to comment out
# step 2: shift + command + c
#
# x<-1
# y<-2
# z<-3
# expand....
# If not already mentioned: use the "document outline" in RStudio to
# navigate through your script and give it structure!
# 2) Data Visualization ----
str(mpg) # tibble vs data.frame # structure
class(mpg)
is.data.frame(mpg) # TRUE
is.numeric(mpg$manufacturer) # FALSE
is.numeric(mpg$year) # TRUE
View(mpg)
create_report(mpg) # DataExplorer, creates a quick overview as html
# conversions
mpg <- as.data.table(mpg)
mpg <- as_tibble(mpg)
is.data.frame(mpg) # TRUE
x <- c("1", "3", "5")
x
x[3]*3 # error
class(x)
x <- as.numeric(x) # conversion
x
is.numeric(x) # TRUE
?mpg # data set included in ggplot2
# Fuel economy data from 1999 to 2008 for 38 popular models of cars
View(mpg)
# What other data sets are in R?
?datasets # package ‘datasets’
library(help = "datasets")
# "$" Operator
mpg$trans
mpg$manufacturer[100:110] # elements 100 to 110 in manufacturer vector
# Subsetting
#mpg[rows, columns]
# How to look at data in detail
mpg[1:10,] # first 10 rows
mpg[1:10, 2:3] # only rows 1 to 10 and columns 2 and 3
mpg[, 2:3] # all rows and columns 2 and 3
mpg[1:10, c("displ", "cyl")]
mpg[1:10, c("displ", "cyl")]$cyl
mpg[1:10, 4:7]
# change a single entry in a data set
mpg_ <- mpg
mpg_[1,1] <- "audi_neu"
mpg_[1,1]
mpg_
mpg[1:10, c("displ", "cyl")]
mpg[1:10, c(3,5)]
head(mpg) # show first 6 rows (default)
tail(mpg, n = 12) # show last 12 rows
headTail(mpg) # first and last rows (package psych)
mpg
# Base R commands:
table(mpg$manufacturer) # frequency table
table(mpg$manufacturer)/sum(table(mpg$manufacturer))*100 # relative frequencies
sum(table(mpg$manufacturer)/sum(table(mpg$manufacturer))*100) # = 100 %
barplot( table(mpg$manufacturer) )
cor(mpg$displ, mpg$hwy) # correlation (Pearson)
plot(mpg$displ, mpg$hwy) # Scatterplot, x-y
# versus using ggplot2
mpg %>% # so-called pipe operator (also included in R by now as "|>")
ggplot(aes(x = fct_infreq(manufacturer) %>% fct_rev())) + # GPT: "What does this command do?"
geom_bar(stat = "count") + # add a barplot
coord_flip() + # flip the plot
xlab("") + ylab("") + # no labels
ggtitle("Title of the plot") +
theme(plot.title = element_text(hjust = 0.5))
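# Side note: the magrittr pipe %>% and R's native pipe |> (available since R 4.1)
# are interchangeable in simple cases like this one:
mpg %>% count(manufacturer, sort = TRUE)
mpg |> count(manufacturer, sort = TRUE) # same result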
# base R Plot:
plot(mpg$displ, mpg$hwy)
ggplot(data = mpg) +
geom_point(aes(x = displ, y = hwy))
ggplot(data = mpg) +
geom_point(aes(x = displ, y = hwy), position = "jitter")
# negative relationship between engine size (displ) and fuel efficiency (hwy)
ggplot(data = mpg) # creates an empty plot!
# GPT-Prompt: "Lets pimp this plot... title "Scatterplot for
# displacement and highway mileage", color red if displacement > 6,
# color green if displacement <4; add a smoothing line; omit the
# legend title, center the title"
# [first iteration, colors were not quite there yet...]
mpg <- mpg %>% mutate(cat_hwy = case_when(
displ < 4 ~ "green",
displ >= 4 & displ <= 6 ~ "blue",
displ > 6 ~ "red"
))
p <- ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_point(aes(color = cat_hwy), position = position_jitter(width = 0.1, height = 0.1)) +
geom_smooth(method = "loess", se = FALSE) +
#geom_smooth(method = "lm", se = FALSE) + # lm = linear model
ggtitle("Scatterplot for Displacement and Highway Mileage") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(legend.title = element_blank()) +
scale_color_manual(values = c("green" = "green", "blue" = "blue", "red" = "red"))
p
# UP TO HERE DAY 2----
# What are the red dots and relatively fuel efficient points in the plot?
# Package "dplyr" by Hadley Wickham.
mpg %>% dplyr::filter(displ > 6 & hwy > 20)
# inefficient cars with displ > 6?
mpg %>% filter(displ > 6 & hwy < 20)
# most efficient cars
mpg %>% filter(displ < 2 & hwy > 40)
summary(mpg)
dim(mpg) # number of rows and cols
mpg[2,4] # look at single element in the data frame
#mpg[2,4] <- 1998 # change single entry
mpg[2,4]*2
as.numeric(mpg[2,4]) # convert to numeric
mpg[ order(mpg$manufacturer, decreasing = TRUE), ] # change order, sort in descending order for manufacturer
# or with dplyr:
mpg %>% arrange(desc(manufacturer)) # descending
# Save R whole workspace or single objects in R:
saveRDS(mpg, "mpg.RDS") # nice feature
mpg <- readRDS("mpg.RDS")
save.image(file = "my_workspace.RData")
load("my_workspace.RData")
# Very useful:
colnames(mpg) # column names of data frame
#colnames(mpg)[1] <- c("MANUF_new")
unique(mpg$class) # unique entries in vector
table(mpg$class) # frequency table (absolute frequencies)
table(mpg$class)/sum(table(mpg$class))*100 # relative frequencies
length(unique(mpg$class)) # how many different entries are there?
# One more dimension as information: car class
mpg %>% ggplot() +
geom_point(aes(x = displ, y = hwy, color = class)) + # different colors for vehicle class
ggtitle("Title")
ggplot(data = mpg) +
geom_point(aes(x = displ, y = hwy), color = "blue") +
xlab("displacement") +
ggtitle("Titel") # + ......
# nice feature: split plot into subplots
mpg %>% dplyr::filter(year %in% 1999:2001) %>%
ggplot() +
geom_point(aes(x = displ, y = hwy)) +
facet_wrap(~ class, nrow = 2) # stratify the plots by "class"
# Does the relationship between displ and hwy change within classes?
# two categorical variables
mpg %>% ggplot() +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_grid(drv ~ cyl) + # 12 sub groups -> 3 are empty; stratify by cyl and drv
ggtitle("Mileage vs displacement categorized via zylinders and drive train") +
xlab("engine displacement, in litres") +
ylab("highway miles per gallon") +
theme(plot.title = element_text(hjust = 0.5))
# There seem to be no data points with 4 or 5 cylinders and rear wheel drive
ggplot(data = mpg) +
geom_point(aes(x = displ, y = hwy)) +
facet_wrap(drv ~ cyl) + # 9 sub groups, the empty ones are omitted here
ggtitle("Mileage vs displacement categorized via zylinders and drive train") +
xlab("engine displacement, in litres") +
ylab("highway miles per gallon")
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
geom_smooth(method = "loess") # add a smoothing line; https://en.wikipedia.org/wiki/Local_regression
# loess = locally estimated scatterplot smoothing
ggplot(mpg) +
geom_smooth(aes(x = displ, y = hwy, color = drv), show.legend = TRUE)
ggplot(data = mpg, aes(x = displ, y = hwy)) +
geom_point() +
geom_smooth(method = "lm") # "lm"=linear model. least squares estimated regression line
ggplot(data = mpg) +
geom_point(aes(x = displ, y = hwy), position = "jitter") # the default for the position parameter is "identity"
?geom_jitter
?datasets
library(help = "datasets")
# further examples
# data set diamonds
ggplot(data = diamonds) +
geom_bar(aes(x = cut)) # simple bar chart
bar <- ggplot(data = diamonds) +
geom_bar(
aes(x = cut, fill = cut), # fill: R has 657 built-in named colours, which can be listed with grDevices::colors().
show.legend = FALSE, # no legend
width = 0.5 # Bar width. By default, set to 90% of the resolution of the data.
) +
theme(aspect.ratio = 1) +
labs(x = NULL, y = NULL) # no labels
bar + coord_flip() # now graph is shown
bar + coord_polar() # do not do this!
# 3) READ data ----
# Reading data is often very (!) tedious; e.g., due to the simultaneous use
# of "." and "," as decimal separators or different definition of missing values!
# Furthermore, there could be various line breaks in the dataset,
# making reading less "straightforward"...
# You'll never find these difficulties with toy datasets.
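# A minimal sketch of the arguments that usually solve these issues
# ("messy.csv" is only a placeholder file name):
# dat <- read.csv2("messy.csv") # ";" as column separator, "," as decimal point
# dat <- read.csv("messy.csv", na.strings = c("", "NA", "-99")) # define which codes mean "missing"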
# 3.1) Read directly from the web:--------
crab <- read.table("http://faculty.washington.edu/kenrice/rintro/crab.txt", # source
sep = " ", # column separator, here space
dec = ".", # decimal point
header = TRUE) # use the first row for column names
# sep="\t" would read tab-separated files, always check the raw file to see the separator!
str(crab) # structure
View(crab) # works only in RStudio
summary(crab)
colnames(crab)
colnames(crab)[3]
# create new variable and add to data set
crab$new_var <- crab$width*10 # base R
# or
#crab <- as.data.table(crab)
#crab[, new_var_1 := width*10] # data.table syntax
# or
crab <- crab %>% mutate(new_var_3 = width*10) # dplyr
crab
crab %>%
dplyr::select(last_col(2):last_col()) # explicitly take "select" from dplyr (sometimes you have packages loaded with identical function names...)
# expand...
# Delete a variable/column:
crab$new_var_3 <- NULL # base R
crab %>% head()
# or
#crab[, new_var_1 := NULL] # data.table syntax
# or
crab <- crab %>% dplyr::select(-new_var)
crab %>% head() # worked
# 3.2) or read just from a local path on your hard disk--------
# Excel:
getwd() # get working directory
df <- readxl::read_xlsx("./Data/bike_excel_small.xlsx") # Note that there is (in my case) another command with the identical name in the package officer, hence I added the "readxl::"
# The command looks for a file named bike_excel_small.xlsx in the folder "Data" in the current working directory.
# Make sure that the current working directory is set correctly!
# column names: Do not use "Umlaute", " ", special characters, or numbers at the beginning of the column names!
# Example: Rotational_value_1,...
# Don't: "Erster Messung ß?"
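# A minimal base-R sketch for repairing problematic column names
# (the example names below are made up):
bad_names <- c("first measurement", "1st value", "force (N)")
make.names(bad_names)                    # syntactically valid names
gsub("\\.+", "_", make.names(bad_names)) # optional: replace the dots with underscores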
str(df)
df$dteday <- as.POSIXct(df$dteday, format = "%d.%m.%Y") # Create date
str(df)
df$dteday[1]
df$dteday[2]
# We can now compare dates
df$dteday[1] < df$dteday[2] # TRUE
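# Date arithmetic also works; a small self-contained example with as.Date():
d1 <- as.Date("2011-01-01"); d2 <- as.Date("2011-03-15")
d2 - d1                           # time difference in days
difftime(d2, d1, units = "weeks") # or in other units
d1 + 30                           # 30 days later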
# UP TO HERE DAY 3 ----------
# _write Excel:-------
df
getwd()
write_xlsx(df, "./Data/df_out.xlsx")
# _write Table to Word---------
ft <- flextable(df[, 1:5]) # only first 5 columns
# Save the flextable to a Word document
doc <- read_docx() %>%
body_add_flextable(ft) %>%
body_add_par("") # Add an empty paragraph for spacing
# Specify the output file path
output_path <- "./Data/df_table.docx"
print(doc, target = output_path)
# Text files
?read.csv
?read.csv2
?read.table
# Also check out: https://readr.tidyverse.org/
# 4) Data transformation using dplyr-----
# dplyr is part of tidyverse
?tidyverse # https://www.tidyverse.org/packages/
# Take careful note of the conflicts message that's printed when you load the tidyverse.
# It tells you that dplyr masks some functions from base R (from the stats package).
# If you want to use the base version of these functions after loading dplyr,
# you'll need to use their full names: stats::filter() and stats::lag().
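# A minimal illustration of the masking: after loading dplyr, filter() means
# dplyr::filter(); the time-series filter from stats must be called explicitly:
stats::filter(ts(rnorm(24), frequency = 12), rep(1/3, 3)) # moving average (stats version)
dplyr::filter(mpg, cyl == 4)                              # data-frame filtering (dplyr version)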
?flights
str(flights)
# POSIXct: convenient for date calculations
# comparisons with >/</==
flights$time_hour[c(1,100)]
flights$time_hour[1:100]
# compare:
flights$time_hour[c(1)] < flights$time_hour[c(100)]
"YES" < "No"
# Tibbles are data frames, but slightly tweaked to work better in the tidyverse.
flights # larger data set, 336k rows
data.frame(x=1:10, y=1:10) # does not show dimensions and col types.
# __Important functions in dplyr, the basic grammar:-------
# - Pick observations by their values: filter().
# - Reorder the rows: arrange().
# - Pick variables by their names: select().
# - Create new variables with functions of existing variables: mutate().
# - Collapse many values down to a single summary: summarise().
# - These can all be used in conjunction with group_by() which changes the
# scope of each function from operating on the entire dataset to operating on it group-by-group.
# -->> These six functions provide the verbs for a language of data manipulation.
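# A short preview of how these verbs combine in a single pipe
# (each verb is covered in detail below):
flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(carrier) %>%
  dplyr::summarise(mean_arr_delay = mean(arr_delay), n = n()) %>%
  arrange(desc(mean_arr_delay))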
flights %>% filter(month == 1, day == 1) # "==" for comparisons
( dec25 <- flights %>% filter(month == 12, day == 25) ) # directly shows result
# saves the resulting tibble to dec25
( x <- 1:10 ) # directly shows result
# expand.....
# Fun (non-intuitive) Facts:
sqrt(2) ^ 2 == 2
1 / 49 * 49 == 1 # 0.02040816*49 = 0.9999998 (limited mantissa length)
options(digits = 22)
1/49
# 0.02040816326530612082046
options(digits = 10)
# google:
0.02040816326*49 # 0.99999999974
?near
near(sqrt(2) ^ 2, 2) # TRUE
near(1 / 49 * 49, 1)
# or:
abs(1 / 49 * 49 - 1) < 10^(-5) # TRUE
# logical Operators!!
1 == 1 # equal
1 > 2 # larger
3 >= 3 # larger or equal
3 != 4 # not equal
TRUE | TRUE # logical OR
TRUE | FALSE # logical OR
FALSE | FALSE # logical OR
TRUE & FALSE # logical AND
TRUE & TRUE # logical AND
c(TRUE,TRUE) & c(TRUE,FALSE) # element wise
xor(TRUE, TRUE) # exclusive OR
xor(TRUE, FALSE)
# very useful:
ind <- c(1,3,5,18,21)
for(i in ind){
print(i)
}
# very helpful:
1:60 %in% ind
1:60 %nin% ind # Hmisc package
# set operations
( x <- 1:7 )
( y <- 4:10 )
union(x, y) # union
setdiff(x, y) # difference "x - y"
setdiff(y, x) # is not symmetric
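# the third common set operation:
intersect(x, y) # intersection: 4 5 6 7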
# Filter
flights %>% filter(month == 11 | month == 12)
# base R:
subset(flights, month == 11 | month == 12) # base R
identical( flights %>% filter(month == 11 | month == 12),
subset(flights, month == 11 | month == 12) ) # TRUE
# or with the %in% operator:
flights %>% filter(month %in% c(11, 12))
flights %>% filter(!(arr_delay > 120 | dep_delay > 120))
flights %>% filter(arr_delay <= 120 & dep_delay <= 120) # identical (De Morgan)
# Missing values - important, since they occur all the time!
# NA .... 'Not Available' / Missing Values
NA > 5 # NA
10 == NA
NA + 10 # relevant e.g. in mean()
d <- c(1, 7, 56, NA)
mean(d, na.rm = FALSE) # NA; na.rm ... "NA remove"
(1+7+56+NA)/4
mean(d, na.rm = TRUE) # 21.33333333
# ..
NA / 2 # NA
NA == NA # why?
# Let x be Mary's age. We don't know how old she is.
x <- NA
# Let y be John's age. We don't know how old he is.
y <- NA
# Are John and Mary the same age?
x == y
# We don't know!
# check classes is.XXXXXX()
is.na(x) # very important!
is.na(d)
sum(is.na(d)) # number of missing values in vector d; FALSE FALSE FALSE TRUE is summed as 0+0+0+1 = 1
is.numeric(45.4) # num
class(45.4)
class(45)
is.character("hallo")
is.character("1")
is.character(1) # FALSE
# Recycling example!!
1:3 + 1:12 # recycling the shorter one
data.frame(x = 1:3, y = 1:12) # recycling the shorter one
data.frame(x = 1:3, y = 1:10) # Error, no multiple of the other
1:3 + 1:10 # What happens?
# pasting:
x <- 12
y <- 3
paste0(x, " and ", y) # 0 means no space between, different data types are allowed, here character and integer
paste0(x, y) # no space
paste(x, "and", y)
paste(x, "and", y, sep = "::")
x <- rnorm(100)
y <- 1.2*x + rnorm(100)
cor_label <- cor(x, y) # correlation
title_for_plot <- paste("Correlation is", round(cor_label, 2))
title_for_plot
ggplot(data = data.frame(x = x, y = y)) +
geom_point(aes(x = x, y = y)) +
ggtitle(title_for_plot) +
theme(plot.title = element_text(hjust = 0.5))
# arrange() works similarly to filter() except that instead of selecting rows, it changes their order.
flights %>% arrange(year, month, day)
flights %>% arrange(year, month, desc(day))
sum(is.na(flights)) # there are NAs in the data set
vis_miss(flights) # error... too many rows
flights %>% dplyr::slice_sample(n = 1000) %>% vis_miss() # only 1000 rows
sum(is.na(flights$year)) # = 0; not in years
# select() ... useful subset using operations based on the names of the variables.
flights %>% dplyr::select(year, month, day) # select only these three variables and show the result
flights %>% dplyr::select(year:day) # select range of variables from year to day. -> practical as you don't need to count column numbers
flights %>% dplyr::select(-(year:day)) # What happens here?
# useful:
#starts_with("abc") # matches names that begin with "abc".
flights %>% dplyr::select(starts_with("s"))
#ends_with("xyz") # matches names that end with "xyz".
#contains("ijk") # matches names that contain "ijk".
flights %>% dplyr::select(contains("time"))
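# and ends_with() in action:
flights %>% dplyr::select(ends_with("delay")) # dep_delay, arr_delay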
flights %>% dplyr::rename(tail_num = tailnum) # rename
flights %>% dplyr::select(time_hour, air_time, everything()) # time_hour, air_time
# https://tidyselect.r-lib.org/reference/everything.html
# mutate() always adds new columns at the end of your dataset
flights_sml <- flights %>% dplyr::select(# smaller data set
year:day, # year, months, day
ends_with("delay"),
distance,
air_time
)
flights_sml %>% mutate(
gain = dep_delay - arr_delay,
speed = distance / air_time * 60
)
# head(flights_sml) # we did not assign the result, so flights_sml is unchanged
flights_sml %>% transmute(# only keeps new columns
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
# summarise() collapses a data frame to a single row
flights %>%
group_by(origin) %>%
summarise(delay = mean(dep_delay, na.rm = TRUE)) # simple mean
# UP TO HERE DAY 4, 2024-12-05---------
# __more PIPE-Operator----
# without pipe
by_dest <- group_by(flights, dest)
delay <- dplyr::summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
delay
# It looks like delays increase with distance up to ~750 miles
# and then decrease. Maybe as flights get longer there's more
# ability to make up delays in the air?
delay %>% ggplot(aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) + # alpha .. transparency of the points, in [0,1]
geom_smooth(se = FALSE) # se stands for standard error; it controls the confidence band around the estimated curve (switched off here)
# same with Pipe-Operator:
flights %>%
group_by(dest) %>%
dplyr::summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL") %>% ggplot(aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
# Note: Working with the pipe is one of the key criteria for belonging to the tidyverse.
# The only exception is ggplot2: it was written before the pipe was discovered
# undo group
daily <- group_by(flights, year, month, day)
daily %>%
ungroup() %>% # no longer grouped by date
dplyr::summarise(flights = n()) # all flights
# Example: Find all groups bigger than a threshold:
flights %>% # How many flights are there for each destination?
group_by(dest) %>%
dplyr::summarise(n())
# Filter
flights %>%
group_by(dest) %>%
filter(n() > 365)
# 5) Exploratory Data Analysis EDA ----
# a) Generate questions about your data.
# b) Search for answers by visualizing, transforming, and modelling your data.
# c) Use what you learn to refine your questions and/or generate new questions.
# What type of variation occurs within my variables?
# What type of covariation occurs between my variables?
?diamonds # Prices of over 50,000 round cut diamonds
# Visualizing distributions
# simple frequency table
table(diamonds$cut) # categorical variable, is it ordinal or nominal?
diamonds <- diamonds %>% group_by(cut) %>% mutate(count_cut = n())
ggplot(data = diamonds ) +
geom_bar(aes(x = fct_reorder(cut, count_cut, .desc = TRUE)) ) + # e.g., descending order
xlab("")
# to examine the distribution of a continuous variable, use a histogram:
ggplot(data = diamonds) +
geom_histogram(aes(x = carat), binwidth = 0.5)
# This attempts to graphically represent the density of a continuous random variable
# overlay multiple histograms in the same plot
smaller <- diamonds %>%
filter(carat < 3) # consider, for example, only diamonds with <3 carats
ggplot(data = smaller, aes(x = carat, colour = cut)) +
geom_freqpoly(binwidth = 0.1)
ggplot(data = smaller, aes(x = carat)) +
geom_histogram(binwidth = 0.01) +
scale_x_continuous(breaks = seq(0, 3, by = 0.5))
# "The ideal weight for a classic engagement ring is between 1.00 and 1.50 carats.
# The average carat weight of a diamond engagement ring is slightly over 1 carat,
# specifically 1.18 carats. The average size of a diamond engagement ring
# varies by country and region."
# see https://github.com/jdegenfellner/ZHAW_Teaching/blob/main/Density_plot_boxplot_below.R
# for a nice histogram with boxplot below
# Are there outliers/unusual values?
# One definition could be:
# "An outlier is an observation that lies an abnormal distance from other values in a random sample from a population.
# In a sense, this definition leaves it up to the analyst (or a consensus process) to decide what will be considered
# abnormal. Before abnormal observations can be singled out, it is necessary to characterize normal observations."
# see also https://en.wikipedia.org/wiki/Outlier
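# One common (but by no means the only) operationalisation is the 1.5*IQR rule
# used by boxplots; a minimal sketch on the diamond width variable y:
q <- quantile(diamonds$y, c(0.25, 0.75))
iqr <- q[2] - q[1]
sum(diamonds$y < q[1] - 1.5*iqr | diamonds$y > q[2] + 1.5*iqr) # number of flagged observations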
ggplot(diamonds) +
geom_histogram(aes(x = y), binwidth = 0.5) # y = width in mm (0--58.9)
# zoom in on small counts on the y-axis to make the rare values visible
ggplot(diamonds) +
geom_histogram(aes(x = y), binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50))
# look at unusual ones:
unusual <- diamonds %>%
filter(y < 3 | y > 20) %>% # | logical OR
dplyr::select(price, x, y, z) %>% # x ... length in mm (0--10.74)
arrange(y)
unusual
# Covariation
# If variation describes the behavior within a variable, covariation describes the behavior between variables.
# Covariation is the tendency for the values of two or more variables to vary together in a related way.
# Example: how the price of a diamond varies with its quality:
ggplot(data = diamonds, aes(x = price)) +
geom_freqpoly(aes(colour = cut), binwidth = 500)
# difficult to compare
# For a better comparison, consider the density, i.e., the area under the polygon is now 1.
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)
# Maybe we can simply represent this as a boxplot:
ggplot(diamonds, aes(x = reorder(cut, price, FUN = median), y = price)) + # on the x-axis is the categorical variable
geom_boxplot() + # cut, on the y-axis the continuous variable price
xlab("")
# Comparison of central values is now easier
# Supports the counter-intuitive finding that better quality diamonds are cheaper on average!
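# A quick numeric check of this finding; carat (diamond size) is a plausible
# confounder, since better-cut diamonds tend to be smaller in this data set:
diamonds %>%
  group_by(cut) %>%
  dplyr::summarise(median_price = median(price), median_carat = median(carat))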
# Visualization of two categorical variables
ggplot(data = diamonds) +
geom_count(mapping = aes(x = cut, y = color))
# Visualize two continuous variables
# often you can just use base-R plotting if you need something quick
plot(diamonds$carat, diamonds$price) # a bit ugly
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price))
# to make it look less cluttered, add transparency
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price), alpha = 1 / 100) # with alpha in [0,1]
# The trick with bins works not only in histograms, it also works in 2D
ggplot(data = diamonds) +
geom_bin2d(mapping = aes(x = carat, y = price))
ggplot(data = diamonds) +
geom_hex(mapping = aes(x = carat, y = price)) # library hexbin; even more aesthetic with hexagons
# You can also bin a single variable:
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_width(carat, 0.1))) # cut_width() divides the variable into segments of length 0.1
# Patterns/Clusters
ggplot(data = faithful) +
geom_point(mapping = aes(x = eruptions, y = waiting)) +
xlab("Eruption time in mins") +
ylab("Waiting time to next eruption (in mins)")
# In 2D, you can see the clusters with the naked eye.
# -> longer wait times are associated with longer eruptions
# Multiple dimensions require multivariate methods -> see cluster analysis later!
# shorter
ggplot(faithful, aes(eruptions)) +
geom_freqpoly(binwidth = 0.25)
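# A minimal clustering sketch for the scatterplot above
# (k-means with k = 2, chosen by eye from the plot):
set.seed(1)
km <- kmeans(scale(faithful), centers = 2)
table(km$cluster) # cluster sizes
ggplot(faithful, aes(x = eruptions, y = waiting, color = factor(km$cluster))) +
  geom_point() +
  labs(color = "cluster")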
# 6) Data Wrangling - the art of getting your data into R in a useful form for visualisation and modelling ----
# tibbles are data.frames, just an improved version of them.
str(iris)
class(iris)
iris_tibble <- as_tibble(iris)
class(iris_tibble) # so it is still a data.frame
# e.g., you can use column names that would otherwise not be allowed:
tb <- tibble(
`:)` = "smile",
` ` = "space",
`2000` = "number"
)
tb
df <- data.frame( `:)` = "smile",
` ` = "space",
`2000` = "number")
df # the column names are changed to syntactically valid ones (not what we typed)
# There are further differences, but they are not crucial at the moment.
# For illustration we use read_csv. Very often data come in the form of csv files.
# Let's try to read in a real (rectangular/tabular) bike-rental data set:
bike <- read_csv("./Data/day.csv", col_names = TRUE)
bike #shows tibble