Take-Home Exercise 6

With reference to bullet point 2 of Challenge 1 of VAST Challenge 2022, I will reveal the patterns of community interactions of the city of Engagement, Ohio USA by using social network analysis approach.

LIU Zhenglin https://example.com/norajones (SMU SCIS)https://example.com/spacelysprokets
2022-06-06

Importing packages

packages = c('igraph', 'tidygraph', 
             'ggraph', 'visNetwork', 
             'lubridate', 'clock',
             'tidyverse', 'graphlayouts',
             'DT', "patchwork")
for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Importing Data

social <- read_csv("E:/data/Journals/SocialNetwork.csv")
part <- read_csv("E:/data/Attributes/Participants.csv")
log <- read_csv("E:/data/Activity Logs/ParticipantStatusLogs1.csv")

Data Wrangling

Choose a time period to analyse social network in Ohio

The SocialNetwork file contains a very long time period social network record of participants, there are nearly 7.5 million records, the dataset is to large for analysis, and people’s social activity has patterns, so in this take home exercise, I’ll only analyse the activities in two weeks, one choose from the beginning of this dataset, one at the end.

social <- social %>% 
  mutate(Weekday = wday(timestamp,
                        label = TRUE,
                        abbr = FALSE)) %>% 
  mutate(Month = month(timestamp)) %>% 
  mutate(Week = week(timestamp)) %>% 
  mutate(Year = year(timestamp))

As we can see the first week is week 9, but the first date is a Tuesday, so I choose 2022, week 10 as a sample and choose 2023, week 20 as the sample in the end.

social_1 <- social %>% 
  filter(Year == 2022) %>% 
  filter(Week == 10)
social_2 <- social %>% 
  filter(Year == 2023) %>% 
  filter(Week == 20)
write_rds(social_1, "E:/data/social_1.rds")
write_rds(social_2, "E:/data/social_2.rds")

Find more attributes for each participant

log_attribute <- log %>% 
  filter(timestamp == log$timestamp[1])
glimpse(log_attribute)
write_rds(log_attribute, "E:/data/log_attribute.rds")

Importing rds files to reduce the size of raw data

social_s <-  read_rds("E:/data/social_1.rds")
social_e <-  read_rds("E:/data/social_2.rds")
log_att <- read_rds("E:/data/log_attribute.rds")
job <- read_csv("E:/data/Attributes/Jobs.csv")
part <- read_csv("data/Attributes/Participants.csv")
employer <- read_csv("data/Attributes/Employers.csv")

Divide age group and join job id by participant id and use the new data frame as node.

log_att <- log_att %>% 
  select(participantId, jobId)
part <- part %>%
  mutate(age_group = cut(age, breaks = c(17,25,35,45,55,60))) %>% 
  left_join(log_att, by = "participantId")
part <- part %>% 
  left_join(job, by = "jobId")
part <- part %>% 
  left_join(employer, by = "employerId")

Change the orginal format into edge dataframe format

social_s_edge_work_aggregated <- social_s %>% 
  filter(Weekday %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")) %>% 
  group_by(participantIdFrom, participantIdTo) %>% 
  summarise(Weight = n()) %>% 
  filter(participantIdFrom != participantIdTo) %>% 
  filter(Weight > 1) %>% 
  ungroup

social_s_edge_rest_aggregated <- social_s %>% 
  filter(Weekday %in% c("Saturday", "Sunday")) %>% 
  group_by(participantIdFrom, participantIdTo) %>% 
  summarise(Weight = n()) %>% 
  filter(participantIdFrom != participantIdTo) %>% 
  filter(Weight > 1) %>% 
  ungroup

social_e_edge_work_aggregated <- social_e %>% 
  filter(Weekday %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")) %>% 
  group_by(participantIdFrom, participantIdTo) %>% 
  summarise(Weight = n()) %>% 
  filter(participantIdFrom != participantIdTo) %>% 
  filter(Weight > 1) %>% 
  ungroup

social_e_edge_rest_aggregated <- social_e %>% 
  filter(Weekday %in% c("Saturday", "Sunday")) %>% 
  group_by(participantIdFrom, participantIdTo) %>% 
  summarise(Weight = n()) %>% 
  filter(participantIdFrom != participantIdTo) %>% 
  filter(Weight > 1) %>% 
  ungroup
part_s_w <- part %>% 
  filter(participantId %in% c(social_s_edge_work_aggregated$participantIdFrom, social_s_edge_work_aggregated$participantIdTo))
social_graph_s_w <- graph_from_data_frame(social_s_edge_work_aggregated, 
                                          vertices = part_s_w) %>% 
  as_tbl_graph() %>% 
  activate(edges) %>%
  arrange(desc(Weight))
social_graph_s_w
# A tbl_graph: 853 nodes and 4068 edges
#
# A directed simple graph with 5 components
#
# Edge Data: 4,068 x 3 (active)
   from    to Weight
  <int> <int>  <int>
1     1   215      5
2     1   570      5
3     2    55      5
4     2   723      5
5     3    65      5
6     3   185      5
# ... with 4,062 more rows
#
# Node Data: 853 x 17
  name  householdSize haveKids   age educationLevel interestGroup
  <chr>         <dbl> <lgl>    <dbl> <chr>          <chr>        
1 0                 3 TRUE        36 HighSchoolOrC~ H            
2 1                 3 TRUE        25 HighSchoolOrC~ B            
3 2                 3 TRUE        35 HighSchoolOrC~ A            
# ... with 850 more rows, and 11 more variables: joviality <dbl>,
#   age_group <chr>, jobId <dbl>, employerId <dbl>, hourlyRate <dbl>,
#   startTime <dbl>, endTime <dbl>, daysToWork <chr>,
#   educationRequirement <chr>, location <chr>, buildingId <dbl>
part_s_r <- part %>% 
  filter(participantId %in% c(social_s_edge_rest_aggregated$participantIdFrom, social_s_edge_rest_aggregated$participantIdTo))
social_graph_s_r <- graph_from_data_frame(social_s_edge_rest_aggregated, 
                                          vertices = part_s_r) %>% 
  as_tbl_graph() %>% 
  activate(edges) %>%
  arrange(desc(Weight))
social_graph_s_r
# A tbl_graph: 780 nodes and 2390 edges
#
# A directed simple graph with 45 components
#
# Edge Data: 2,390 x 3 (active)
   from    to Weight
  <int> <int>  <int>
1     1   660      2
2     2    58      2
3     2   186      2
4     3   113      2
5     4    82      2
6     4    84      2
# ... with 2,384 more rows
#
# Node Data: 780 x 17
  name  householdSize haveKids   age educationLevel interestGroup
  <chr>         <dbl> <lgl>    <dbl> <chr>          <chr>        
1 1                 3 TRUE        25 HighSchoolOrC~ B            
2 2                 3 TRUE        35 HighSchoolOrC~ A            
3 3                 3 TRUE        21 HighSchoolOrC~ I            
# ... with 777 more rows, and 11 more variables: joviality <dbl>,
#   age_group <chr>, jobId <dbl>, employerId <dbl>, hourlyRate <dbl>,
#   startTime <dbl>, endTime <dbl>, daysToWork <chr>,
#   educationRequirement <chr>, location <chr>, buildingId <dbl>
part_e_w <- part %>% 
  filter(participantId %in% c(social_e_edge_work_aggregated$participantIdFrom, social_e_edge_work_aggregated$participantIdTo))
social_graph_e_w <- graph_from_data_frame(social_e_edge_work_aggregated, 
                                          vertices = part_e_w) %>% 
  as_tbl_graph() %>% 
  activate(edges) %>%
  arrange(desc(Weight))
social_graph_s_w
# A tbl_graph: 853 nodes and 4068 edges
#
# A directed simple graph with 5 components
#
# Edge Data: 4,068 x 3 (active)
   from    to Weight
  <int> <int>  <int>
1     1   215      5
2     1   570      5
3     2    55      5
4     2   723      5
5     3    65      5
6     3   185      5
# ... with 4,062 more rows
#
# Node Data: 853 x 17
  name  householdSize haveKids   age educationLevel interestGroup
  <chr>         <dbl> <lgl>    <dbl> <chr>          <chr>        
1 0                 3 TRUE        36 HighSchoolOrC~ H            
2 1                 3 TRUE        25 HighSchoolOrC~ B            
3 2                 3 TRUE        35 HighSchoolOrC~ A            
# ... with 850 more rows, and 11 more variables: joviality <dbl>,
#   age_group <chr>, jobId <dbl>, employerId <dbl>, hourlyRate <dbl>,
#   startTime <dbl>, endTime <dbl>, daysToWork <chr>,
#   educationRequirement <chr>, location <chr>, buildingId <dbl>
part_e_r <- part %>% 
  filter(participantId %in% c(social_e_edge_rest_aggregated$participantIdFrom, social_e_edge_rest_aggregated$participantIdTo))
social_graph_e_r <- graph_from_data_frame(social_e_edge_rest_aggregated, 
                                          vertices = part_e_r) %>% 
  as_tbl_graph() %>% 
  activate(edges) %>%
  arrange(desc(Weight))
social_graph_s_r
# A tbl_graph: 780 nodes and 2390 edges
#
# A directed simple graph with 45 components
#
# Edge Data: 2,390 x 3 (active)
   from    to Weight
  <int> <int>  <int>
1     1   660      2
2     2    58      2
3     2   186      2
4     3   113      2
5     4    82      2
6     4    84      2
# ... with 2,384 more rows
#
# Node Data: 780 x 17
  name  householdSize haveKids   age educationLevel interestGroup
  <chr>         <dbl> <lgl>    <dbl> <chr>          <chr>        
1 1                 3 TRUE        25 HighSchoolOrC~ B            
2 2                 3 TRUE        35 HighSchoolOrC~ A            
3 3                 3 TRUE        21 HighSchoolOrC~ I            
# ... with 777 more rows, and 11 more variables: joviality <dbl>,
#   age_group <chr>, jobId <dbl>, employerId <dbl>, hourlyRate <dbl>,
#   startTime <dbl>, endTime <dbl>, daysToWork <chr>,
#   educationRequirement <chr>, location <chr>, buildingId <dbl>

Social Network Plot

g_s_w <- ggraph(social_graph_s_w,layout = 'kk') +
  geom_edge_link(aes(width=Weight),
                 alpha=0.2) +
  scale_edge_width(range = c(0.1, 0.5))+
  geom_node_point(aes(colour = educationLevel), 
                  size = 1)+
  geom_node_text(aes(label = name),size=1.5, repel=TRUE)+
  labs(subtitle = "Working days social network at begining")

g_s_r <- ggraph(social_graph_s_r,layout = 'kk') +
  geom_edge_link(aes(width=Weight),
                 alpha=0.2) +
  scale_edge_width(range = c(0.1, 0.5))+
  geom_node_point(aes(colour = educationLevel), 
                  size = 1)+
  geom_node_text(aes(label = name),size=1.5, repel=TRUE)+
  labs(subtitle = "Weekend social network at begining")

g_e_w <- ggraph(social_graph_e_w,layout = 'kk') +
  geom_edge_link(aes(width=Weight),
                 alpha=0.2) +
  scale_edge_width(range = c(0.1, 0.5))+
  geom_node_point(aes(colour = educationLevel), 
                  size = 1)+
  geom_node_text(aes(label = name),size=1.5, repel=TRUE)+
  labs(subtitle = "Working days social network at the end")

g_e_r <- ggraph(social_graph_e_r,layout = 'kk') +
  geom_edge_link(aes(width=Weight),
                 alpha=0.2) +
  scale_edge_width(range = c(0.1, 0.5))+
  geom_node_point(aes(colour = educationLevel), 
                  size = 1)+
  geom_node_text(aes(label = name),size=1.5, repel=TRUE)+
  labs(subtitle = "Weekends social network at the end")


g_s_w
g_s_r
g_e_w
g_e_r

Conclusion

From the the charts above, we can get some conclusions:

  1. According to time change, the pattern of people’s social network have a significant change, we cannot say that people always follow a similar pattern in social network.

  2. As time pass, the connection between the participants become closer, according to the working day in start and end, we can see there are more lines between participant, so the centre of this chart is very dark, weekend charts shows the same change, in the start, at weekend, some people only connected with 2 or 3 people, there is a social connection centre, but some people are not in this net. while, in the end, there are three centre in the social network, maybe as the time pass, three centres will become one

  3. The social network pattern is quite different between working days and weekends.