Skip to content

Latest commit

 

History

History
701 lines (554 loc) · 37.4 KB

R_Script.md

File metadata and controls

701 lines (554 loc) · 37.4 KB

R_Script.R

hoang 2023-03-18

library(tidyverse)
library (readxl)
library(gridExtra)
library(ggrepel)

getwd()
## [1] "C:/Users/hoang/Documents/Data Related/Data Projects/World Cup/World+Cup"
#GGPLOT Theme Setting

#FIRST STEP: READING AND CLEANING DATA

#1 WC Events, Date, Time, Location
WC_Events_uncleaned <- read_csv("data/world_cups.csv",col_names= TRUE)
## Rows: 22 Columns: 9
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Host Country, Winner, Runners-Up, Third, Fourth
## dbl (4): Year, Goals Scored, Qualified Teams, Matches Played
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(WC_Events_uncleaned)
## # A tibble: 6 × 9
##    Year `Host Country` Winner     `Runners-Up`   Third   Fourth     `Goals Scored` `Qualified Teams` `Matches Played`
##   <dbl> <chr>          <chr>      <chr>          <chr>   <chr>               <dbl>             <dbl>            <dbl>
## 1  1930 Uruguay        Uruguay    Argentina      USA     Yugoslavia             70                13               18
## 2  1934 Italy          Italy      Czechoslovakia Germany Austria                70                16               17
## 3  1938 France         Italy      Hungary        Brazil  Sweden                 84                15               18
## 4  1950 Brazil         Uruguay    Brazil         Sweden  Spain                  88                13               22
## 5  1954 Switzerland    Germany FR Hungary        Austria Uruguay               140                16               26
## 6  1958 Sweden         Brazil     Sweden         France  Germany FR            126                16               35
#remove white space between column name

names(WC_Events_uncleaned) <- gsub(" ", "",colnames(WC_Events_uncleaned))


#pivot data to longer

WC_Events <- WC_Events_uncleaned %>%  pivot_longer(c(Winner, `Runners-Up`, Third, Fourth), names_to = "Standing", values_to = "Country")


#2 WC Matches

#The wC_Matches had not included results for WC 2022 joining the data with another one to retrieve the newest 2022 data

WC_Matches2022 <- read_csv("data/2022_world_cup_matches.csv", col_names = TRUE)
## Rows: 64 Columns: 8
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Stage, Home Team, Away Team, Win Condition
## dbl  (2): ID, Year
## lgl  (1): Host Team
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
WC_MatchesB42022 <- read_csv("data/B42022_world_cup_matches.csv", col_names = TRUE)
## Rows: 900 Columns: 10
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Stage, Home Team, Away Team, Win Condition
## dbl  (4): ID, Year, Home Goals, Away Goals
## lgl  (1): Host Team
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(WC_Matches2022)
## # A tibble: 6 × 8
##      ID  Year Date       Stage       `Home Team`   `Away Team` `Win Condition` `Host Team`
##   <dbl> <dbl> <date>     <chr>       <chr>         <chr>       <chr>           <lgl>      
## 1     1  2022 2022-11-20 Group stage Qatar         Ecuador     <NA>            TRUE       
## 2     2  2022 2022-11-21 Group stage Senegal       Netherlands <NA>            FALSE      
## 3     3  2022 2022-11-21 Group stage England       Iran        <NA>            FALSE      
## 4     4  2022 2022-11-21 Group stage United States Wales       <NA>            FALSE      
## 5     5  2022 2022-11-22 Group stage France        Australia   <NA>            FALSE      
## 6     6  2022 2022-11-22 Group stage Denmark       Tunisia     <NA>            FALSE
#Using the temp_data to join and retrieve the readable data
temp_data <- read_csv("data/Fifa_world_cup_matches.csv", col_names = TRUE)
## Rows: 64 Columns: 88
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): team1, team2, possession team1, possession team2, possession in contest, date, category
## dbl  (80): number of goals team1, number of goals team2, total attempts team1, total attempts team2, conceded team1, conceded team2, goal inside the penalty area team1, goal inside ...
## time  (1): hour
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Cleaning temp data to join converting data into lower case
temp_data$team1 <- str_to_title(tolower(temp_data$team1))
temp_data$team2 <- str_to_title(tolower(temp_data$team2))


#Joining and removing non-essential columns
                                            
WC_Matches2022 <- WC_Matches2022 %>% left_join(temp_data, by =c("Home Team"= "team1", "Away Team" = "team2")) %>% select(1:8, 12:13)

#Now Let's see what column this data have so we can rename column and do some cleaning

head(WC_Matches2022)
## # A tibble: 6 × 10
##      ID  Year Date       Stage       `Home Team`   `Away Team` `Win Condition` `Host Team` `number of goals team1` `number of goals team2`
##   <dbl> <dbl> <date>     <chr>       <chr>         <chr>       <chr>           <lgl>                         <dbl>                   <dbl>
## 1     1  2022 2022-11-20 Group stage Qatar         Ecuador     <NA>            TRUE                              0                       2
## 2     2  2022 2022-11-21 Group stage Senegal       Netherlands <NA>            FALSE                             0                       2
## 3     3  2022 2022-11-21 Group stage England       Iran        <NA>            FALSE                             6                       2
## 4     4  2022 2022-11-21 Group stage United States Wales       <NA>            FALSE                             1                       1
## 5     5  2022 2022-11-22 Group stage France        Australia   <NA>            FALSE                             4                       1
## 6     6  2022 2022-11-22 Group stage Denmark       Tunisia     <NA>            FALSE                             0                       0
#renaming column and relocate column and changing column type for union - here we can join Home goals with numberofgoalteam 1 and Away Goals with numberofgoalteam2
WC_Matches2022 <- WC_Matches2022 %>% rename('Home Goals' = 'number of goals team1', 'Away Goals' = 'number of goals team2') %>% relocate('Home Goals', .after = 'Home Team' ) 
WC_Matches2022 <- relocate(WC_Matches2022, 'Away Goals', .after = 'Away Team')
WC_Matches2022$`Home Goals` <- as.double(WC_Matches2022$`Home Goals`)
                               
#Now that the data is sharing the same format we can union
WC_Matches <- union_all(WC_MatchesB42022, WC_Matches2022)

#removing space between column
names(WC_Matches) <- gsub(" ", "",colnames(WC_Matches))


#let's take a look at the clean version
head(WC_Matches)
## # A tibble: 6 × 10
##      ID  Year Date       Stage       HomeTeam      HomeGoals AwayGoals AwayTeam WinCondition HostTeam
##   <dbl> <dbl> <date>     <chr>       <chr>             <dbl>     <dbl> <chr>    <chr>        <lgl>   
## 1     1  1930 1930-07-13 Group stage France                4         1 Mexico   <NA>         FALSE   
## 2     2  1930 1930-07-13 Group stage United States         3         0 Belgium  <NA>         FALSE   
## 3     3  1930 1930-07-14 Group stage Yugoslavia            2         1 Brazil   <NA>         FALSE   
## 4     4  1930 1930-07-14 Group stage Romania               3         1 Peru     <NA>         FALSE   
## 5     5  1930 1930-07-15 Group stage Argentina             1         0 France   <NA>         FALSE   
## 6     6  1930 1930-07-16 Group stage Chile                 3         0 Mexico   <NA>         FALSE
#3 International Matches - same logic

International_Matches <- read_csv("data/International_Matches.csv", col_names = TRUE)
## Rows: 17769 Columns: 9
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Tournament, Home Team, Away Team, Win Conditions
## dbl  (3): ID, Home Goals, Away Goals
## lgl  (1): Home Stadium
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(International_Matches) <- gsub(" ", "",colnames(International_Matches))

head(International_Matches)
## # A tibble: 6 × 9
##      ID Tournament Date       HomeTeam HomeGoals AwayGoals AwayTeam WinConditions HomeStadium
##   <dbl> <chr>      <date>     <chr>        <dbl>     <dbl> <chr>    <chr>         <lgl>      
## 1     1 Friendly   1872-11-30 Scotland         0         0 England  <NA>          TRUE       
## 2     2 Friendly   1873-03-08 England          4         2 Scotland <NA>          TRUE       
## 3     3 Friendly   1874-03-07 Scotland         2         1 England  <NA>          TRUE       
## 4     4 Friendly   1875-03-06 England          2         2 Scotland <NA>          TRUE       
## 5     5 Friendly   1876-03-04 Scotland         3         0 England  <NA>          TRUE       
## 6     6 Friendly   1876-03-25 Scotland         4         0 Wales    <NA>          TRUE
#4 World cup Squads and Group

WC_Groups <- read_csv("data/2022_world_cup_groups.csv", col_names = TRUE)
## Rows: 32 Columns: 4
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Group, Team, Final_Standing
## dbl (1): FIFA Ranking
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(WC_Groups) <- gsub(" ", "",colnames(WC_Groups))

head(WC_Groups)
## # A tibble: 6 × 4
##   Group Team        FIFARanking Final_Standing
##   <chr> <chr>             <dbl> <chr>         
## 1 A     Qatar                50 Group Stage   
## 2 A     Ecuador              44 Group Stage   
## 3 A     Senegal              18 Top 16        
## 4 A     Netherlands           8 Top 8         
## 5 B     England               5 Top 8         
## 6 B     Iran                 20 Group Stage
#Moving on to the analysis steps, some of the insights we looking to answer
#1. Host Country Performance:

#Select the host country matches only
Host_Country_Matches <- filter(WC_Matches, HostTeam == TRUE)

head(Host_Country_Matches)
## # A tibble: 6 × 10
##      ID  Year Date       Stage          HomeTeam HomeGoals AwayGoals AwayTeam      WinCondition HostTeam
##   <dbl> <dbl> <date>     <chr>          <chr>        <dbl>     <dbl> <chr>         <chr>        <lgl>   
## 1     9  1930 1930-07-18 Group stage    Uruguay          1         0 Peru          <NA>         TRUE    
## 2    14  1930 1930-07-21 Group stage    Uruguay          4         0 Romania       <NA>         TRUE    
## 3    17  1930 1930-07-27 Semi-finals    Uruguay          6         1 Yugoslavia    <NA>         TRUE    
## 4    18  1930 1930-07-30 Final          Uruguay          4         2 Argentina     <NA>         TRUE    
## 5    25  1934 1934-05-27 Round of 16    Italy            7         1 United States <NA>         TRUE    
## 6    29  1934 1934-05-31 Quarter-finals Italy            1         1 Spain         <NA>         TRUE
#We use this data to find the furthest round that the host country make to

Host_Country_Best_Stage <- Host_Country_Matches %>% group_by(Year) %>% top_n(1,Date)

head(Host_Country_Best_Stage)
## # A tibble: 6 × 10
## # Groups:   Year [6]
##      ID  Year Date       Stage          HomeTeam    HomeGoals AwayGoals AwayTeam       WinCondition HostTeam
##   <dbl> <dbl> <date>     <chr>          <chr>           <dbl>     <dbl> <chr>          <chr>        <lgl>   
## 1    18  1930 1930-07-30 Final          Uruguay             4         2 Argentina      <NA>         TRUE    
## 2    35  1934 1934-06-10 Final          Italy               2         1 Czechoslovakia Extra time   TRUE    
## 3    48  1938 1938-06-12 Quarter-finals France              1         3 Italy          <NA>         TRUE    
## 4    75  1950 1950-07-16 Final round    Uruguay             2         1 Brazil         <NA>         TRUE    
## 5    95  1954 1954-06-26 Quarter-finals Switzerland         5         7 Austria        <NA>         TRUE    
## 6   136  1958 1958-06-29 Final          Sweden              2         5 Brazil         <NA>         TRUE
Host_Country_Best_Stage$Stage <- fct_recode(Host_Country_Best_Stage$Stage, "Final Round" ="Final", "Group Stage" = "Final round", "Group Stage" = "Second group stage", "Group Stage"="Group stage", "Quarter-final Round" = "Quarter-finals", "Round of 16" = "Round of 16", "Third-Fourth Round"= "Third place")


#We create a new column in that indicate whether the host country could make to the top 4 that year
WC_Events_Host_Standing<- WC_Events %>% 
  group_by(Year) %>%
  mutate(
    HostFinalStanding = if_else(
      HostCountry %in% head(Country, 4), 
      Standing, 
      "Out Top 4")) %>% 
  filter(HostCountry== Country | HostFinalStanding =="Out Top 4") %>% select(-Country, -Standing) %>% distinct()

head(WC_Events_Host_Standing)
## # A tibble: 6 × 6
## # Groups:   Year [6]
##    Year HostCountry GoalsScored QualifiedTeams MatchesPlayed HostFinalStanding
##   <dbl> <chr>             <dbl>          <dbl>         <dbl> <chr>            
## 1  1930 Uruguay              70             13            18 Winner           
## 2  1934 Italy                70             16            17 Winner           
## 3  1938 France               84             15            18 Out Top 4        
## 4  1950 Brazil               88             13            22 Runners-Up       
## 5  1954 Switzerland         140             16            26 Out Top 4        
## 6  1958 Sweden              126             16            35 Runners-Up
#In order to see host performance over time, create a numeric column that numerises the Host Final Standing in each WC

Host_Country_Data <- WC_Events_Host_Standing %>% 
  left_join(Host_Country_Best_Stage, by = "Year") 

Host_Country_Data <- Host_Country_Data %>% 
  mutate(HostFinalStandingDetailed = case_when(HostFinalStanding== "Out Top 4" ~ (case_match(as.character(Stage),"Quarter-finals" ~ "Top 8", "Round of 16" ~ "Top 16", "Final round"~ "Top 4", .default = "Group Stage")), .default = HostFinalStanding))
                             
Host_Country_Data <- Host_Country_Data %>% 
  mutate(HostFinalStandingNumber = case_match(HostFinalStandingDetailed, "Winner"~ 6,"Runners-Up" ~ 5, "Third" ~ 4, "Fourth" ~ 3,"Top 8" ~ 2, "Top 16"~1, .default=0))


#Let's graph the host country performance overtime
Host_Country_Performance_Chart <- ggplot(data = Host_Country_Data,mapping = aes( x = Year, y = HostFinalStandingNumber)) + geom_line(color = "red", size = 1, alpha = 0.5) + geom_text(aes(label = str_c(HostCountry,"\n", HostFinalStandingDetailed)), color = "black", vjust =0, size = 3) + theme(legend.position = "none") + ggtitle("Host Country Performance Over Time")  + scale_x_continuous(limits = c(1930, 2022), breaks = seq(1930,2022,4)) + theme(axis.title.y = element_blank(),axis.text.y = element_blank(), legend.position = "none")

ggsave("charts/Host_Country_Performance_Overtime.png", plot = Host_Country_Performance_Chart, width = 8, height = 6, units = "in")

Host_Country_Performance_Chart

#Now Let's see how many times the host countries make it to each round in every WC
Host_Data_Summary <- fct_count(Host_Country_Data$HostFinalStandingDetailed) %>% rename(Standing = "f", Times = "n")

Host_Data_Summary$Standing <- fct_relevel (Host_Data_Summary$Standing,"Winner", "Runners-Up", "Third", "Fourth", "Top 8","Top 16", "Group Stage")
## Warning: 1 unknown level in `f`: Top 8
Host_Country_Performance_Summarize <- ggplot(data = Host_Data_Summary, aes(x = Standing, y = Times)) + 
  geom_bar(stat = "identity", aes(fill = Times), color = "black") + 
  scale_fill_gradient(low = "grey", high = "red") + 
  ggtitle("Host Country Performance Summary") +
  xlab("") +
  geom_text(aes(label = Times), position = position_stack(vjust = 0.8), fontface = "bold") +
  guides(fill = "none")

ggsave("charts/Host_Country_Performance_Summarize.png", plot = Host_Country_Performance_Summarize, width = 8, height = 6, units = "in")

Host_Country_Performance_Summarize

#Next let's look at the Average Goal Scored and Average Goal Concede by each Host Country

Summarised_HostPerformance <- WC_Matches %>%
  filter(HostTeam==TRUE) %>%
  group_by(Year) %>%
  summarise(
    Average_Goal_Scored = mean(HomeGoals),
    Average_Goal_Conceded = mean(AwayGoals))



Summarised_QatarPerformance <- filter(Summarised_HostPerformance, Year == 2022 )
Qatar_Average_Goal_Scored <- Summarised_QatarPerformance$Average_Goal_Scored
Qatar_Average_Goal_Conceded <- Summarised_QatarPerformance$Average_Goal_Conceded


# Average Goal Scored Chart
Average_Goal_Scored <- ggplot(data = Summarised_HostPerformance, aes(x = Year, y = Average_Goal_Scored)) + 
  geom_line(color = "red", size = 1, alpha = 0.5) +
  ggtitle("Host Country Average Goal Scored Per Match") +
  theme(axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 10)) +
  scale_x_continuous(limits = c(1930, 2022), breaks = seq(1930, 2022, 4)) +
  scale_y_continuous(breaks = seq(0, max(Summarised_HostPerformance$Average_Goal_Scored), 0.5)) +
  ylab("Average Goals") +
  xlab("Year") +
  geom_hline(aes(yintercept = Qatar_Average_Goal_Scored, linetype = "Average Goals Scored"), 
             color = "red", linetype = "dashed") +
  scale_linetype_manual(values=c("dashed"), guide = "none") +
  geom_label(aes(x = max(Year), y = Qatar_Average_Goal_Scored, label = "Qatar"), 
             color = "black", size = 4, vjust = 1, hjust = 0.2) +
  geom_text(aes(label = round(Average_Goal_Scored, 1)), nudge_y = 0.1, hjust = -0.2, size = 3.5) +
  theme(legend.position = "none")

ggsave("charts/Average_Goal_Scored_byHost.png", plot = Average_Goal_Scored, width = 8, height = 6, units = "in")

Average_Goal_Scored

# Chart with only Average Goal Conceded
Average_Goal_Conceded <- ggplot(data = Summarised_HostPerformance, aes(x = Year, y = Average_Goal_Conceded)) + 
  geom_line(color = "blue", size = 1, alpha = 0.5) +
  ggtitle("Host Country Average Goal Conceded Per Match") +
  theme(axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 10)) +
  scale_x_continuous(limits = c(1930, 2022), breaks = seq(1930, 2022, 4)) +
  scale_y_continuous(breaks = seq(0, max(Summarised_HostPerformance$Average_Goal_Conceded), 0.5)) +
  ylab("Average Goals Conceded") +
  xlab("Year") +
  geom_hline(aes(yintercept = Qatar_Average_Goal_Conceded, linetype = "Average Goals Conceded"), 
             color = "blue", linetype = "dashed") +
  scale_linetype_manual(values=c("dashed"), guide = "none") +
  geom_label(aes(x = max(Year), y = Qatar_Average_Goal_Conceded, label = "Qatar"), 
             color = "black", size = 4, vjust = 0.1, hjust = 0.2) +
  geom_text(aes(label = round(Average_Goal_Conceded, 1)), nudge_y = -0.1, hjust = -0.2, size = 3.5) +
  theme(legend.position = "none")

ggsave("charts/Average_Goal_Conceded_byHost.png", plot = Average_Goal_Conceded, width = 8, height = 6, units = "in")

Average_Goal_Conceded

#2 Based on recent form and historical dominance which countries under performed and over performed


#Manipulate data to see historical dominance

head(WC_Events)
## # A tibble: 6 × 7
##    Year HostCountry GoalsScored QualifiedTeams MatchesPlayed Standing   Country       
##   <dbl> <chr>             <dbl>          <dbl>         <dbl> <chr>      <chr>         
## 1  1930 Uruguay              70             13            18 Winner     Uruguay       
## 2  1930 Uruguay              70             13            18 Runners-Up Argentina     
## 3  1930 Uruguay              70             13            18 Third      USA           
## 4  1930 Uruguay              70             13            18 Fourth     Yugoslavia    
## 5  1934 Italy                70             16            17 Winner     Italy         
## 6  1934 Italy                70             16            17 Runners-Up Czechoslovakia
WC_dominance_summary <- WC_Events %>%
  group_by(HostCountry, Standing) %>%
  count() %>%
  pivot_wider(names_from = Standing, values_from = n, values_fill = 0) %>% #transform this column values into multiple columns that count the number of times in top 4
  rename(Winner = "Winner", `Runners-Up` = "Runners-Up", Third = "Third", Fourth = "Fourth")

WC_dominance_summary <- WC_dominance_summary %>% mutate(Times_in_Top4 = rowSums(across(Winner:Fourth)))

head(WC_dominance_summary)
## # A tibble: 6 × 6
## # Groups:   HostCountry [6]
##   HostCountry Fourth `Runners-Up` Third Winner Times_in_Top4
##   <chr>        <int>        <int> <int>  <int>         <dbl>
## 1 Argentina        1            1     1      1             4
## 2 Brazil           2            2     2      2             8
## 3 Chile            1            1     1      1             4
## 4 England          1            1     1      1             4
## 5 France           2            2     2      2             8
## 6 Germany          2            2     2      2             8
#Let's graph the country that dominate in WC and their achievement:

WC_dominants <- WC_dominance_summary %>%
  pivot_longer(cols = -c(HostCountry,Times_in_Top4), names_to = "Place", values_to = "Sum") %>%
  mutate(Place = factor(Place, levels = c("Winner", "Runners-Up", "Third", "Fourth"))) %>%
  mutate(Country = fct_reorder(HostCountry, Sum, sum)) %>% 
  ggplot(aes(x = Country, y = Sum, fill = Place)) +
  geom_col(position = "stack", color = "black") +
  geom_text(aes(label = ifelse(Sum > 0, Sum, "")), position = position_stack(vjust = 0.5), size = 3) +
  coord_flip() +
  scale_fill_manual(values = c("#5DA5DA", "#FAA43A", "#60BD68", "#F17CB0")) +
  labs(x = "", y = "Performances", fill = "") +
  ggtitle("Top Performers in World Cup") +
  theme_minimal() + 
  scale_y_continuous(limits = c(0, 12), breaks = seq(0,12,2))

ggsave("charts/WC_dominants.png", plot = WC_dominants, width = 8, height = 6, units = "in")

WC_dominants

#Moving on to each country recent competition form, we use the International Matches data

head(International_Matches)
## # A tibble: 6 × 9
##      ID Tournament Date       HomeTeam HomeGoals AwayGoals AwayTeam WinConditions HomeStadium
##   <dbl> <chr>      <date>     <chr>        <dbl>     <dbl> <chr>    <chr>         <lgl>      
## 1     1 Friendly   1872-11-30 Scotland         0         0 England  <NA>          TRUE       
## 2     2 Friendly   1873-03-08 England          4         2 Scotland <NA>          TRUE       
## 3     3 Friendly   1874-03-07 Scotland         2         1 England  <NA>          TRUE       
## 4     4 Friendly   1875-03-06 England          2         2 Scotland <NA>          TRUE       
## 5     5 Friendly   1876-03-04 Scotland         3         0 England  <NA>          TRUE       
## 6     6 Friendly   1876-03-25 Scotland         4         0 Wales    <NA>          TRUE
#Data Transformation
International_Matches_Transformed <- 
  International_Matches %>% pivot_longer(cols = c(HomeTeam, AwayTeam), names_to = "Home_Away", values_to = "Country") %>% #pivoting Home Team and Away Team to one column only
  mutate(Goal_Scored = ifelse (Home_Away == "HomeTeam", HomeGoals, AwayGoals)) %>% #Cleaning columns with conditions
  filter( Country %in% (WC_Groups$Team)) #keep countries in the WC team only

#Let's evaluate the recent form by keeping the only top 5 most recent matches for each country:
International_Matches_Transformed_latest5 <- International_Matches_Transformed %>%
  arrange(Country, Date) %>%  # sort by Country and Date
  group_by(Country) %>%
  slice_tail(n = 5)  # keep only the last 5 matches for each Country

#Based on the goal scored of home team and away team we can determine the result of the matches:
International_Matches_Transformed_latest5 <- International_Matches_Transformed_latest5 %>%
    mutate(Result = case_when(
    Home_Away == "HomeTeam" & HomeGoals > AwayGoals ~ "Win",
    Home_Away == "HomeTeam" & HomeGoals < AwayGoals ~ "Loss",
    Home_Away == "AwayTeam" & HomeGoals > AwayGoals ~ "Loss",
    Home_Away == "AwayTeam" & HomeGoals < AwayGoals ~ "Win",
    TRUE ~ "Draw"
  ))

#Using that result we assigned 3 points to a win, 1 point to a draw and 0 point to a loss for a country, the sum of point will be used to determine the recent form of a WC participants
International_Matches_Form <- International_Matches_Transformed_latest5 %>% select(Country, Result) %>% 
  mutate(Result_numeric = case_match(Result, "Win" ~ 3,  "Draw" ~ 1,  "Loss"  ~ 0)) %>%
  group_by(Country) %>%
  summarise(Total_point = sum(Result_numeric)) %>%
  mutate(Form_B4_WC = case_when(Total_point>10 ~ "A", Total_point <5 ~ "C", .default = "B"))%>%
  arrange(desc(Total_point))

#Let's view the Match form data after all these steps so far
head(International_Matches_Form)
## # A tibble: 6 × 3
##   Country     Total_point Form_B4_WC
##   <chr>             <dbl> <chr>     
## 1 Brazil               15 A         
## 2 Argentina            13 A         
## 3 Croatia              13 A         
## 4 Netherlands          13 A         
## 5 Senegal              13 A         
## 6 Serbia               13 A
WC_participants_recentform <-ggplot(data = International_Matches_Form, aes(x = Total_point, y = reorder(Country,Total_point), color = Form_B4_WC)) +
  geom_point(size = 4) +
  scale_color_manual(values = c("green", "blue", "red"), name = "Form_B4_WC") +
  labs(x = "Total Points", y = "Country", title = "Team Performance Summary Based on last 5 matches") +
  theme_bw() +
  scale_x_continuous(breaks = seq(0, 15, by = 2))

ggsave("charts/WC_participants_recentform.png", plot = WC_participants_recentform, width = 8, height = 6, units = "in")

WC_participants_recentform

#Create a new tibble that join recent forms(numerical) to WC 2022 Countries
WC2022_Performance <-WC_Groups %>% left_join(International_Matches_Form, by = c("Team" = "Country")) %>% 
  left_join(WC_dominance_summary, by = c("Team" = "HostCountry")) %>%
  replace_na(list(Times_in_Top4 = 0)) %>%
  arrange(Team) %>% 
  mutate(Final_Standing_numeric = case_match(Final_Standing,"Winner" ~ 1, "Runners-Up"~2,"Third Place" ~ 3, "Fourth Place" ~ 4, "Top 8" ~ 5, "Top 16" ~ 6, .default = 7)) # numerise the final standing of WC2022 participants to display on graph 

WC2022_Performance$Final_Standing <- factor(WC2022_Performance$Final_Standing, levels = c("Group Stage", "Top 16", "Top 8", "Fourth Place", "Third Place", "Runners-Up", "Winner"))


WC_participants_Standing_vsForm <-ggplot(data = WC2022_Performance, aes(x = Final_Standing, y = Times_in_Top4, label = Team)) +
  geom_point(size =2 , color = "black") +
  ggrepel::geom_text_repel(aes(color = Form_B4_WC), size = 4, show.legend = TRUE, max.overlaps = Inf, nudge_x = 0.2, nudge_y = 0.2) +
  scale_color_manual(values = c("green", "blue", "red"), name = "Form_B4_WC") +
  labs(x = "Final Standing", y = "Times in Top 4", title = "Team Performance Summary Based on last 5 matches") +
  theme_bw() +
  scale_x_discrete(limits = c("Group Stage", "Top 16", "Top 8", "Fourth Place", "Third Place", "Runners-Up", "Winner"))

ggsave("charts/WC_participants_Standing_vsForm.png", plot = WC_participants_Standing_vsForm, width = 8, height = 6, units = "in")

WC_participants_Standing_vsForm

#3 Argentina 2022 road to glory compare to France 2018

#Let's filter for Argentina and Frances matches only
WC_Matches_Transformed <- 
  WC_Matches %>% pivot_longer(cols = c(HomeTeam, AwayTeam), names_to = "Home_Away", values_to = "Country") %>%
  mutate(Goal_Scored = ifelse (Home_Away == "HomeTeam", HomeGoals, AwayGoals)) %>%
  filter((Country == "France" & Year == 2018) | (Country =="Argentina" & Year == 2022))

#See above for transformation logic
WC_Matches_Transformed <- WC_Matches_Transformed %>%
  mutate(Result = case_when(
    Home_Away == "HomeTeam" & HomeGoals > AwayGoals ~ "Win",
    Home_Away == "HomeTeam" & HomeGoals < AwayGoals ~ "Loss",
    Home_Away == "AwayTeam" & HomeGoals > AwayGoals ~ "Loss",
    Home_Away == "AwayTeam" & HomeGoals < AwayGoals ~ "Win",
    TRUE ~ "Draw")) %>%
  mutate(Result_numeric_streak = case_match(Result, "Win" ~ 1,  "Draw" ~ 0,  "Loss"  ~ -1)) %>%
  arrange(Country,Date) %>%
  group_by(Country) %>%
  mutate(match_index = row_number())


# plot Year vs. Result_Numeric with Country as the grouping variable
Road_to_Glory <- ggplot(WC_Matches_Transformed, aes(x = match_index, y = Result_numeric_streak, color = Country)) +
  geom_line(size = 1) + geom_point(size =2) +
  geom_text(aes(label = paste(Result, ifelse(is.na(WinCondition),"",WinCondition), sep = "\n")), 
            hjust = -0.1, size = 3, show.legend = FALSE, color = "black")  +
  labs(title = "Road to glory Argentina 2022 vs 2018 France",
       x = "Match Index",
       y = "Result",
       color = "Country") +
  scale_color_manual(values = c("#75AADB", "darkblue")) +
  facet_wrap(~Country, ncol = 2) +
  theme(axis.text.y = element_blank())

ggsave("charts/Road_to_Glory.png", plot = Road_to_Glory, width = 8, height = 6, units = "in")

Road_to_Glory

#Let's compare the number of Goal Scored and Conceded between the two Countries

head(WC_Matches_Transformed)
## # A tibble: 6 × 14
## # Groups:   Country [1]
##      ID  Year Date       Stage          HomeGoals AwayGoals WinCondition                      HostTeam Home_Away Country   Goal_Scored Result Result_numeric_streak match_index
##   <dbl> <dbl> <date>     <chr>              <dbl>     <dbl> <chr>                             <lgl>    <chr>     <chr>           <dbl> <chr>                  <dbl>       <int>
## 1     8  2022 2022-11-22 Group stage            1         2 <NA>                              FALSE    HomeTeam  Argentina           1 Loss                      -1           1
## 2    24  2022 2022-11-26 Group stage            2         0 <NA>                              FALSE    HomeTeam  Argentina           2 Win                        1           2
## 3    39  2022 2022-11-30 Group stage            0         2 <NA>                              FALSE    AwayTeam  Argentina           2 Win                        1           3
## 4    50  2022 2022-12-03 Round of 16            2         1 <NA>                              FALSE    HomeTeam  Argentina           2 Win                        1           4
## 5    58  2022 2022-12-09 Quarter-finals         2         2 Argentina win on penalties (3 -4) FALSE    AwayTeam  Argentina           2 Draw                       0           5
## 6    61  2022 2022-12-13 Semi-finals            3         0 <NA>                              FALSE    HomeTeam  Argentina           3 Win                        1           6
Total_Goal_Scored <- WC_Matches_Transformed %>%
  group_by(Country) %>%
  summarize(Total_Goals_Scored = sum(ifelse(Home_Away == "HomeTeam", HomeGoals, AwayGoals)),
            Total_Goals_Conceded = sum(ifelse(Home_Away == "HomeTeam", AwayGoals, HomeGoals))) %>%
  ggplot(aes(x = Country, y = Total_Goals_Scored, fill = Country)) +
  geom_col(position = "dodge") +
  geom_text(aes(label=Total_Goals_Scored), position=position_dodge(width=0.9), vjust=-0.5) +
  facet_wrap(~ .) +
  labs(title = "Total Goals Scored",
       x = NULL, y = NULL) +
  guides(fill=FALSE) +
  scale_fill_manual(values = c("#75AADB", "darkblue")) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"))

ggsave("charts/Total_Goal_Scored_FranceArgentina.png", plot = Total_Goal_Scored, width = 8, height = 6, units = "in")


Total_Goal_Conceded <- WC_Matches_Transformed %>%
  group_by(Country) %>%
  summarize(Total_Goals_Scored = sum(ifelse(Home_Away == "HomeTeam", HomeGoals, AwayGoals)),
            Total_Goals_Conceded = sum(ifelse(Home_Away == "HomeTeam", AwayGoals, HomeGoals))) %>%
  ggplot(aes(x = Country, y = Total_Goals_Conceded, fill = Country)) +
  geom_col(position = "dodge") +
  geom_text(aes(label=Total_Goals_Conceded), position=position_dodge(width=0.9), vjust=-0.5) +
  facet_wrap(~ .) +
  labs(title = "Total Goals Conceded",
       x = NULL, y = NULL) +
  guides(fill=FALSE) +
  scale_fill_manual(values = c("#75AADB", "darkblue")) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"))

ggsave("charts/Total_Goal_Conceded_FranceArgentina.png", plot = Total_Goal_Conceded , width = 8, height = 6, units = "in")

grid.arrange(Total_Goal_Scored + scale_y_continuous(limits = c(0, 20), expand = c(0, 0)), 
             Total_Goal_Conceded + scale_y_continuous(limits = c(0, 20), expand = c(0, 0)), 
             ncol = 2, widths = c(4, 4), top = "Argentina vs France World Cup Stats")

#rmarkdown::render("R_Script.R", "github_document")