With reference to bullet point 2 of Challenge 1 of VAST Challenge 2022, I will reveal the patterns of community interactions of the city of Engagement, Ohio USA by using social network analysis approach.
packages = c('igraph', 'tidygraph',
'ggraph', 'visNetwork',
'lubridate', 'clock',
'tidyverse', 'graphlayouts',
'DT', "patchwork")
for(p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p, character.only = T)
}
social <- read_csv("E:/data/Journals/SocialNetwork.csv")
part <- read_csv("E:/data/Attributes/Participants.csv")
log <- read_csv("E:/data/Activity Logs/ParticipantStatusLogs1.csv")
The SocialNetwork file contains a very long time period social network record of participants, there are nearly 7.5 million records, the dataset is to large for analysis, and people’s social activity has patterns, so in this take home exercise, I’ll only analyse the activities in two weeks, one choose from the beginning of this dataset, one at the end.
social <- social %>%
mutate(Weekday = wday(timestamp,
label = TRUE,
abbr = FALSE)) %>%
mutate(Month = month(timestamp)) %>%
mutate(Week = week(timestamp)) %>%
mutate(Year = year(timestamp))
As we can see the first week is week 9, but the first date is a Tuesday, so I choose 2022, week 10 as a sample and choose 2023, week 20 as the sample in the end.
log_attribute <- log %>%
filter(timestamp == log$timestamp[1])
glimpse(log_attribute)
write_rds(log_attribute, "E:/data/log_attribute.rds")
social_s <- read_rds("E:/data/social_1.rds")
social_e <- read_rds("E:/data/social_2.rds")
log_att <- read_rds("E:/data/log_attribute.rds")
job <- read_csv("E:/data/Attributes/Jobs.csv")
part <- read_csv("data/Attributes/Participants.csv")
employer <- read_csv("data/Attributes/Employers.csv")
social_s_edge_work_aggregated <- social_s %>%
filter(Weekday %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")) %>%
group_by(participantIdFrom, participantIdTo) %>%
summarise(Weight = n()) %>%
filter(participantIdFrom != participantIdTo) %>%
filter(Weight > 1) %>%
ungroup
social_s_edge_rest_aggregated <- social_s %>%
filter(Weekday %in% c("Saturday", "Sunday")) %>%
group_by(participantIdFrom, participantIdTo) %>%
summarise(Weight = n()) %>%
filter(participantIdFrom != participantIdTo) %>%
filter(Weight > 1) %>%
ungroup
social_e_edge_work_aggregated <- social_e %>%
filter(Weekday %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")) %>%
group_by(participantIdFrom, participantIdTo) %>%
summarise(Weight = n()) %>%
filter(participantIdFrom != participantIdTo) %>%
filter(Weight > 1) %>%
ungroup
social_e_edge_rest_aggregated <- social_e %>%
filter(Weekday %in% c("Saturday", "Sunday")) %>%
group_by(participantIdFrom, participantIdTo) %>%
summarise(Weight = n()) %>%
filter(participantIdFrom != participantIdTo) %>%
filter(Weight > 1) %>%
ungroup
part_s_w <- part %>%
filter(participantId %in% c(social_s_edge_work_aggregated$participantIdFrom, social_s_edge_work_aggregated$participantIdTo))
social_graph_s_w <- graph_from_data_frame(social_s_edge_work_aggregated,
vertices = part_s_w) %>%
as_tbl_graph() %>%
activate(edges) %>%
arrange(desc(Weight))
social_graph_s_w
# A tbl_graph: 853 nodes and 4068 edges
#
# A directed simple graph with 5 components
#
# Edge Data: 4,068 x 3 (active)
from to Weight
<int> <int> <int>
1 1 215 5
2 1 570 5
3 2 55 5
4 2 723 5
5 3 65 5
6 3 185 5
# ... with 4,062 more rows
#
# Node Data: 853 x 17
name householdSize haveKids age educationLevel interestGroup
<chr> <dbl> <lgl> <dbl> <chr> <chr>
1 0 3 TRUE 36 HighSchoolOrC~ H
2 1 3 TRUE 25 HighSchoolOrC~ B
3 2 3 TRUE 35 HighSchoolOrC~ A
# ... with 850 more rows, and 11 more variables: joviality <dbl>,
# age_group <chr>, jobId <dbl>, employerId <dbl>, hourlyRate <dbl>,
# startTime <dbl>, endTime <dbl>, daysToWork <chr>,
# educationRequirement <chr>, location <chr>, buildingId <dbl>
part_s_r <- part %>%
filter(participantId %in% c(social_s_edge_rest_aggregated$participantIdFrom, social_s_edge_rest_aggregated$participantIdTo))
social_graph_s_r <- graph_from_data_frame(social_s_edge_rest_aggregated,
vertices = part_s_r) %>%
as_tbl_graph() %>%
activate(edges) %>%
arrange(desc(Weight))
social_graph_s_r
# A tbl_graph: 780 nodes and 2390 edges
#
# A directed simple graph with 45 components
#
# Edge Data: 2,390 x 3 (active)
from to Weight
<int> <int> <int>
1 1 660 2
2 2 58 2
3 2 186 2
4 3 113 2
5 4 82 2
6 4 84 2
# ... with 2,384 more rows
#
# Node Data: 780 x 17
name householdSize haveKids age educationLevel interestGroup
<chr> <dbl> <lgl> <dbl> <chr> <chr>
1 1 3 TRUE 25 HighSchoolOrC~ B
2 2 3 TRUE 35 HighSchoolOrC~ A
3 3 3 TRUE 21 HighSchoolOrC~ I
# ... with 777 more rows, and 11 more variables: joviality <dbl>,
# age_group <chr>, jobId <dbl>, employerId <dbl>, hourlyRate <dbl>,
# startTime <dbl>, endTime <dbl>, daysToWork <chr>,
# educationRequirement <chr>, location <chr>, buildingId <dbl>
part_e_w <- part %>%
filter(participantId %in% c(social_e_edge_work_aggregated$participantIdFrom, social_e_edge_work_aggregated$participantIdTo))
social_graph_e_w <- graph_from_data_frame(social_e_edge_work_aggregated,
vertices = part_e_w) %>%
as_tbl_graph() %>%
activate(edges) %>%
arrange(desc(Weight))
social_graph_s_w
# A tbl_graph: 853 nodes and 4068 edges
#
# A directed simple graph with 5 components
#
# Edge Data: 4,068 x 3 (active)
from to Weight
<int> <int> <int>
1 1 215 5
2 1 570 5
3 2 55 5
4 2 723 5
5 3 65 5
6 3 185 5
# ... with 4,062 more rows
#
# Node Data: 853 x 17
name householdSize haveKids age educationLevel interestGroup
<chr> <dbl> <lgl> <dbl> <chr> <chr>
1 0 3 TRUE 36 HighSchoolOrC~ H
2 1 3 TRUE 25 HighSchoolOrC~ B
3 2 3 TRUE 35 HighSchoolOrC~ A
# ... with 850 more rows, and 11 more variables: joviality <dbl>,
# age_group <chr>, jobId <dbl>, employerId <dbl>, hourlyRate <dbl>,
# startTime <dbl>, endTime <dbl>, daysToWork <chr>,
# educationRequirement <chr>, location <chr>, buildingId <dbl>
part_e_r <- part %>%
filter(participantId %in% c(social_e_edge_rest_aggregated$participantIdFrom, social_e_edge_rest_aggregated$participantIdTo))
social_graph_e_r <- graph_from_data_frame(social_e_edge_rest_aggregated,
vertices = part_e_r) %>%
as_tbl_graph() %>%
activate(edges) %>%
arrange(desc(Weight))
social_graph_s_r
# A tbl_graph: 780 nodes and 2390 edges
#
# A directed simple graph with 45 components
#
# Edge Data: 2,390 x 3 (active)
from to Weight
<int> <int> <int>
1 1 660 2
2 2 58 2
3 2 186 2
4 3 113 2
5 4 82 2
6 4 84 2
# ... with 2,384 more rows
#
# Node Data: 780 x 17
name householdSize haveKids age educationLevel interestGroup
<chr> <dbl> <lgl> <dbl> <chr> <chr>
1 1 3 TRUE 25 HighSchoolOrC~ B
2 2 3 TRUE 35 HighSchoolOrC~ A
3 3 3 TRUE 21 HighSchoolOrC~ I
# ... with 777 more rows, and 11 more variables: joviality <dbl>,
# age_group <chr>, jobId <dbl>, employerId <dbl>, hourlyRate <dbl>,
# startTime <dbl>, endTime <dbl>, daysToWork <chr>,
# educationRequirement <chr>, location <chr>, buildingId <dbl>
g_s_w <- ggraph(social_graph_s_w,layout = 'kk') +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 0.5))+
geom_node_point(aes(colour = educationLevel),
size = 1)+
geom_node_text(aes(label = name),size=1.5, repel=TRUE)+
labs(subtitle = "Working days social network at begining")
g_s_r <- ggraph(social_graph_s_r,layout = 'kk') +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 0.5))+
geom_node_point(aes(colour = educationLevel),
size = 1)+
geom_node_text(aes(label = name),size=1.5, repel=TRUE)+
labs(subtitle = "Weekend social network at begining")
g_e_w <- ggraph(social_graph_e_w,layout = 'kk') +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 0.5))+
geom_node_point(aes(colour = educationLevel),
size = 1)+
geom_node_text(aes(label = name),size=1.5, repel=TRUE)+
labs(subtitle = "Working days social network at the end")
g_e_r <- ggraph(social_graph_e_r,layout = 'kk') +
geom_edge_link(aes(width=Weight),
alpha=0.2) +
scale_edge_width(range = c(0.1, 0.5))+
geom_node_point(aes(colour = educationLevel),
size = 1)+
geom_node_text(aes(label = name),size=1.5, repel=TRUE)+
labs(subtitle = "Weekends social network at the end")
g_s_w
g_s_r
g_e_w
g_e_r
From the the charts above, we can get some conclusions:
According to time change, the pattern of people’s social network have a significant change, we cannot say that people always follow a similar pattern in social network.
As time pass, the connection between the participants become closer, according to the working day in start and end, we can see there are more lines between participant, so the centre of this chart is very dark, weekend charts shows the same change, in the start, at weekend, some people only connected with 2 or 3 people, there is a social connection centre, but some people are not in this net. while, in the end, there are three centre in the social network, maybe as the time pass, three centres will become one
The social network pattern is quite different between working days and weekends.