# DATA SCIENCE IN A PANDEMIC
## Professor Dennis F.X. Mathaisel
## This script produces the line chart visualizations referenced as Figures 2 and 3 in the paper.
## Script developed by Abdullah Zahid under the direction of Professor Mathaisel
## The dataset was taken from https://ourworldindata.org/coronavirus
#Libraries
# Install only the packages that are not already available, so re-running the
# script does not reinstall everything each time.
#   packages: character vector of package names to check.
#   ...:      extra arguments forwarded to install.packages().
# Returns (invisibly) the names of the packages that had to be installed.
install_if_missing <- function(packages, ...) {
  is_installed <- vapply(
    packages, requireNamespace, logical(1),
    quietly = TRUE
  )
  missing_packages <- packages[!is_installed]
  if (length(missing_packages) > 0) {
    install.packages(missing_packages, ...)
  }
  invisible(missing_packages)
}

# These three were installed with their full dependency set in the original.
install_if_missing(c("ggvis", "lattice", "RJSplot"), dependencies = TRUE)
# hrbrthemes and Hmisc use the default dependency handling; the remaining
# packages are loaded with library() below but were never installed here.
install_if_missing(
  c("hrbrthemes", "Hmisc", "ggplot2", "dplyr", "viridis", "tidyr")
)
library(ggvis)
library(ggplot2)
library(dplyr)
library(hrbrthemes)
library(viridis)
library(tidyr)
library(Hmisc)
# Set the working directory
# Folder that holds owid-covid-data.csv; edit this path for your machine.
data_dir <- "C:/Docs/Papers/COVID-19/Journal Papers/1st Paper Data Science in a Pandemic/Data Science Journal/Revision Data Science Journal/Scripts/1st Paper Scripts Repository/Data"
setwd(data_dir)
# Read the data file from the working directory set above
covid <- read.csv("owid-covid-data.csv")
# Alternative: pick the file interactively instead of relying on data_dir
# covid <- read.csv(file.choose())
# Inspect the freshly loaded data before transforming it
# Compact overview of every column
str(covid)
# Confirm the object is a data frame
inherits(covid, "data.frame")
# Column names
colnames(covid)
# Dimensions as c(rows, columns)
c(nrow(covid), ncol(covid))
# Class of the object
class(covid)
# Number of columns
ncol(covid)
# All attributes stored on the object
attributes(covid)
# Per-column summary
summary(covid)
# Convert the date column from text to Date.
# The script expects month/day/year text (e.g. "01/05/2021").
raw_dates <- as.character(covid$date)
parsed_dates <- as.Date(raw_dates, format = "%m/%d/%Y")
# If nothing parsed, the file probably stores ISO dates (e.g. "2021-01-05");
# without this fallback every date becomes NA and all later date filters
# silently return zero rows.
# NOTE(review): assumes the only alternative layout is year-month-day - confirm
# against the CSV actually used.
if (all(is.na(parsed_dates)) && !all(is.na(raw_dates))) {
  parsed_dates <- as.Date(raw_dates, format = "%Y-%m-%d")
}
covid$date <- parsed_dates
# Re-check the structure to confirm the column is now of class Date
str(covid)
# Keep only the ten countries analysed in the paper.
# Number every row of the full data set (1..n) in a Record column.
covid$Record <- seq_len(nrow(covid))

# ISO codes of the ten countries of interest
top_iso_codes <- c(
  "BRA", "FRA", "DEU", "IND", "ITA",
  "RUS", "ESP", "TUR", "GBR", "USA"
)

# One vectorised select + filter; the original repeated this identical
# computation once per row of covid inside a for loop.
topcovid <- covid %>%
  select(
    iso_code, location, date,
    total_cases, new_cases, total_deaths, new_deaths
  ) %>%
  filter(iso_code %in% top_iso_codes)
#topcovid
str(topcovid)
#Missing Values
# Number of missing values in each column of topcovid
colSums(is.na(topcovid))
#Removing Missing Values
# na.omit() returns a cleaned copy and leaves its input untouched, so calling it
# without assigning the result (as the original did, column by column) removes
# nothing. Drop every row that has a missing value in any column and store the
# result back, so the rest of the script works on complete rows only.
topcovid <- topcovid[complete.cases(topcovid), ]
#####################################################################################################################
#Graph 1
#Line graphs showing total-case trends since October, November and December 2020

# Number every row of topcovid (1..n) in a Record column.
topcovid$Record <- seq_len(nrow(topcovid))

# Return the rows of `data` dated strictly after `cutoff`.
#   data:   data frame holding the seven columns selected below.
#   cutoff: cutoff date as "YYYY-MM-DD" text.
# The result has the seven analysis columns only (Record is not carried over).
subset_after <- function(data, cutoff) {
  data %>%
    select(
      iso_code, location, date,
      total_cases, new_cases, total_deaths, new_deaths
    ) %>%
    filter(date > as.Date(cutoff))
}

# Build a line chart of total_cases over date, one coloured line per location.
#   data:  data frame with date, total_cases and location columns.
#   title: plot title.
# Returns the ggplot object, which is drawn when printed at top level.
plot_total_cases <- function(data, title) {
  ggplot(
    data,
    aes(x = date, y = total_cases, group = location, color = location)
  ) +
    geom_line() +
    ggtitle(title) +
    theme_ipsum() +
    ylab("Cases")
}

#Cases of Covid Since October
Subset1 <- subset_after(topcovid, "2020-09-30")
#Plot 1
plot_total_cases(Subset1, "Cases of Covid Since October")

#Cases of Covid Since November
Subset2 <- subset_after(topcovid, "2020-10-31")
#Plot 2
plot_total_cases(Subset2, "Cases of Covid Since November")

#Cases of Covid Since December
Subset3 <- subset_after(topcovid, "2020-11-30")
#Plot 3
plot_total_cases(Subset3, "Cases of Covid Since December")
#####################################################################################################################
#Graph 2
#Graph showing ratio of cases to deaths, however this plot was the least significant one in my opinion
# One row per country: the totals recorded on 2021-01-05.
covidGraph2 <- subset(
  topcovid,
  date == as.Date("2021-01-05") &
    iso_code %in% c(
      "BRA", "FRA", "DEU", "IND", "ITA",
      "RUS", "ESP", "TUR", "GBR", "USA"
    ),
  select = c(date, location, total_cases, total_deaths)
)
# Cases that did not end in death (total cases minus total deaths)
testedPos <- covidGraph2$total_cases - covidGraph2$total_deaths
# Two rows per country: one for each case type
country <- rep(covidGraph2$location, each = 2)
# Repeat the label pair once per country actually present, rather than a fixed
# count that only lined up with the data through vector recycling.
caseType <- rep(
  c("Alive & Tested Positive", "Deaths"),
  times = nrow(covidGraph2)
)
# rbind() + c() interleaves the two vectors: pos1, deaths1, pos2, deaths2, ...
totalCases <- c(rbind(testedPos, covidGraph2$total_deaths))
data <- data.frame(country, caseType, totalCases)
# position = "fill" scales every bar to 100%, showing proportions per country
ggplot(data, aes(fill = caseType, y = totalCases, x = country)) +
  geom_bar(position = "fill", stat = "identity") +
  ggtitle("Cases:Death ratio")
#####################################################################################################################
#Graph 3
#Barplot showing cases per million of population
#Calculations Cases per million population
#(cases/(population/1000000))
# Total cases and population per country, using the figures hard-coded in the
# original script.
cases_per_million_inputs <- data.frame(
  Country = c(
    "Brazil", "Spain", "Italy", "Germany", "USA",
    "India", "France", "Turkey", "United Kingdom", "Russia"
  ),
  total_cases = c(
    7961673, 2024904, 2220361, 1886561, 21574043,
    10413417, 2763370, 2296102, 2898052, 3297833
  ),
  population = c(
    209500000, 46940000, 60000000, 84000000, 331000000,
    1353000000, 66990000, 82000000, 66650000, 144500000
  )
)
#Barplot
# Derive the rate from the inputs instead of re-typing the rounded results by
# hand, so the plotted numbers cannot drift away from the calculation.
barplot1 <- data.frame(
  Country = cases_per_million_inputs$Country,
  Cases = round(
    cases_per_million_inputs$total_cases /
      (cases_per_million_inputs$population / 1000000),
    digits = 2
  )
)
# Show the computed cases-per-million values
print(barplot1)
#Barplot
# Horizontal bar chart of cases per million, countries ordered by value and
# each bar labelled with its number.
ggplot(barplot1, aes(x = reorder(Country, Cases), y = Cases, label = Cases)) +
  geom_bar(aes(fill = Country), stat = "identity") +
  # Flip the axes so the country names read horizontally
  coord_flip() +
  geom_text(size = 4) +
  # After coord_flip() the y label appears along the horizontal axis
  ylab("Cases per million population") +
  xlab("Location") +
  ggtitle("Cases per Million Population for Top 10 Countries")
#####################################################################################################################
#Graph 4
# Scatterplot of total cases against total deaths, points filled by country.
# Known issue from the original author: the plot did not come out correctly.
# Please check whether it can be corrected; a solution is still being sought.
#ScatterPlot
layer_points(
  ggvis(topcovid, ~total_cases, ~total_deaths),
  fill = ~factor(location)
)
#####################################################################################################################
# END OF SCRIPT
# © 2024 Babson College. All rights reserved.