---
title: "Patchwork Analysis"
output: html_notebook
author: Kaia Newman
date: 04-11-2026
---


```{r}
# Install required packages (one-time setup).
# Every call is left commented out so that re-running the notebook does not
# reinstall anything; uncomment the lines for packages that are missing.
# install.packages("multcomp")
# install.packages("parameters")
# install.packages("tidyverse")
# install.packages("ggplot2")
# install.packages("ARTool")
# install.packages("performance")
# install.packages("lme4", type = "source")
# install.packages("survival")
# install.packages("coxme")
# install.packages("psych")
# install.packages("lmerTest")
# install.packages("emmeans")
# install.packages("dplyr")
# install.packages("broom.mixed")
# install.packages("car")  # loaded with library(car) in the next chunk
```

# Load libraries
```{r}
#load libraries
library(ggplot2)
library(dplyr)
library(lme4)
library(psych)
library(multcomp)
library(parameters)
library(performance)
library(car)
library("ARTool")
library(tidyverse)
library(survival)
library(coxme)
library(emmeans)
library(broom.mixed)
library(lmerTest)
```
# Read in analysis dataframe and preprocess columns
```{r}
# Load the per-participant, per-task timing and correctness records
full_df <- read.csv("timing_correctness_data.csv")

# Logical flags: was the fix correct, and did the condition include a patch?
full_df$correct <- full_df$correct == "Y"
full_df$had_patch <- full_df$condition == "correct" |
  full_df$condition == "overfitting"

# P2's third task was cut off 5 minutes early.
# Flag it so it can be added to a model, and removed again if it explains
# little variance.
full_df$cut_off <- FALSE
full_df$cut_off[which(full_df$PID == "P2" & full_df$task_no == 3)] <- TRUE

# Grouping variables (used as random effects / predictors) become factors
factor_cols <- c("PID", "bug", "condition")
full_df[factor_cols] <- lapply(full_df[factor_cols], as.factor)

# num_prev: number of tasks completed before this one, as a factor
full_df$num_prev <- as.factor(full_df$task_no - 1)

# would_submit: treat "IDK" as missing; blank cells were read in as "" rather
# than NA, so recode those as well (also for think_patch_correct)
full_df$would_submit[full_df$would_submit %in% c("IDK", "")] <- NA
full_df$think_patch_correct[full_df$think_patch_correct %in% ""] <- NA

# Self-reports: Y and P carry the same valence, so collapse both into "YP"
self_report_cols <- c("think_correct", "think_patch_correct", "would_submit")
full_df[self_report_cols] <- lapply(
  full_df[self_report_cols],
  function(answers) {
    answers[answers %in% c("Y", "P")] <- "YP"
    answers
  }
)

# A judgment is accurate when the participant's belief about their fix
# (YP vs. N) agrees with whether the fix was actually correct
full_df$accurate_about_correctness <-
  (full_df$correct & full_df$think_correct == "YP") |
  (!full_df$correct & full_df$think_correct == "N")

# Event indicator for survival analysis: TRUE when the task ended before the
# 25-minute mark
full_df$status <- full_df$time_minutes < 25
```

# Prep data from survey to put in analysis dataframe
```{r}
# Add columns for in-study survey per participant-task
survey <- read.csv("patchwork_survey.csv")

# Some PIDs occur twice in the survey. Print the rows holding "P1" so the
# duplicated block can be located, then rename the first occurrences
# (rows 4-9) to P1_0 through P6_0 so they do not collide with the later rows.
which(tolower(survey$Q24) == "p1")
survey$Q24[4:9] <- paste0("P", 1:6, "_0")

# Q24 -- PID
# Q1_10 -- mental/perceptual activity
# Q2_1 -- time pressure
# Q3_1 -- performance
# Q4_1 -- effort
# Q5_1 -- frustration

# and so on for t2:
# Q44_10 -- mental/perceptual activity
# Q45_1 -- time pressure
# ...

# and for t3:
# Q50_10 -- mental/perceptual activity
# Q51_1 -- time pressure
# ...

# Create average cognitive load for t1, 2, and 3
# Print a column index -> column name table to check the positions used below
data.frame(index = seq_along(survey), name = names(survey))
survey$Q24 <- as.factor(survey$Q24)

# Some people did not drag the scale and meant 0, so missing / non-numeric
# cells become 0 (closer to what really happened than missing data).
# The column is coerced once, so the coercion warning is raised only once.
fill <- function(x) {
  values <- as.numeric(x)
  values[is.na(values)] <- 0
  values
}
survey[, 20:24] <- lapply(survey[, 20:24], fill)
survey[, 26:30] <- lapply(survey[, 26:30], fill)
survey[, 32:36] <- lapply(survey[, 32:36], fill)

# level of performance (Q3_1, Q46_1, Q52_1) should be inverse
survey$Q3_1 <- 20 - survey$Q3_1
survey$Q46_1 <- 20 - survey$Q46_1
survey$Q52_1 <- 20 - survey$Q52_1

survey$t1_cognitive_load <- rowMeans(survey[, 20:24])
survey$t2_cognitive_load <- rowMeans(survey[, 26:30])
survey$t3_cognitive_load <- rowMeans(survey[, 32:36])

# Now set ONE column in full_df which has these averages, matched on PID (row)
# and task number (column) of the lookup matrix
lookup <- as.matrix(
  survey[, c("t1_cognitive_load", "t2_cognitive_load", "t3_cognitive_load")]
)
rownames(lookup) <- survey$Q24

full_df$cognitive_load <- lookup[
  cbind(
    match(full_df$PID, rownames(lookup)),
    full_df$task_no
  )
]

# Chop off header in survey
survey <- survey[4:nrow(survey), ]

# rename column to make easier to remember as predictor
survey <- rename(survey, PID = Q24)

# Subset to self-efficacy. columns Q2_1.1, Q2_2--8
cols <- c("Q2_1.1", "Q2_2", "Q2_3", "Q2_4", "Q2_5", "Q2_6", "Q2_7", "Q2_8")
se <- survey[, cols]
# See all unique values across all the SE columns
lapply(se[cols], unique)

# Numericize and average
efficacy_map <- c("Strongly Disagree" = 1,
                  "Somewhat disagree" = 2,
                  "Neither agree nor disagree" = 3,
                  "Somewhat agree" = 4,
                  "Strongly agree" = 5)
# Match labels ignoring case and surrounding whitespace. An exact-name lookup
# turns any label that differs only in capitalisation (note the map itself
# mixes "Disagree" and "disagree") into NA, which rowMeans(na.rm = TRUE) would
# then silently drop from the participant's average.
efficacy_keys <- tolower(names(efficacy_map))
local({
  raw_labels <- trimws(as.character(unlist(se[cols])))
  is_answer <- !is.na(raw_labels) & raw_labels != ""
  unmapped <- unique(
    raw_labels[is_answer & !(tolower(raw_labels) %in% efficacy_keys)]
  )
  if (length(unmapped) > 0) {
    warning("Unrecognised self-efficacy responses treated as missing: ",
            paste(unmapped, collapse = ", "), call. = FALSE)
  }
})
se[cols] <- lapply(se[cols], function(x) {
  unname(efficacy_map[match(tolower(trimws(as.character(x))), efficacy_keys)])
})
survey$self_efficacy <- rowMeans(se[cols], na.rm = TRUE)

# Add column to full_df for self_efficacy
full_df <- merge(full_df,
                 survey[, c("PID", "self_efficacy")],
                 by = "PID",
                 all.x = TRUE)
```

# Prep data from screener to put in analysis dataframe
```{r}
# Load the screener responses
screener <- read.csv("patchwork_screener.csv")

# Drop the extra header rows at the top of the export
screener <- screener[3:nrow(screener), ]

# Q11 holds professional years of experience; give it a memorable name and
# make it numeric so it can serve as a predictor
screener <- screener %>% rename(professional_YOE = Q11)
screener$professional_YOE <- as.numeric(screener$professional_YOE)
screener$PID <- as.factor(screener$PID)

# Recode professional identity:
#  - a singleton category is problematic, so the unemployed participant is
#    counted as being recently in their professional career
#  - PhD and Masters are combined into a single "Grad" category
identity_label <- screener$professional_identity
identity_label[identity_label %in% "Unemployed"] <- "Professional"
identity_label[identity_label %in% c("PhD", "Masters")] <- "Grad"
screener$professional_identity <- as.factor(identity_label)

# Category sizes, and identity cross-tabulated against Q13
summary(screener$professional_identity)
table(screener$professional_identity, screener$Q13)
```

# Demographics of participants and bug difficulty
```{r}
# Demographics: median / minimum / maximum of age and professional YOE
screener$Age <- as.numeric(screener$Age)
paste0("Median age: ", median(screener$Age, na.rm = TRUE))
paste0("Minimum age: ", min(screener$Age, na.rm = TRUE))
paste0("Maximum age: ", max(screener$Age, na.rm = TRUE))

paste0("Median YOE: ", median(screener$professional_YOE, na.rm = TRUE))
paste0("Minimum YOE: ", min(screener$professional_YOE, na.rm = TRUE))
paste0("Maximum YOE: ", max(screener$professional_YOE, na.rm = TRUE))

# Q13 is gender, Q16 is are you a student (answers: Yes, I am a Master's student, Yes, I am a Ph.D. student, No, Yes, I am undergraduate), Q10 is have you worked or are you currently working... with "Yes, I currently work at one" being the professional label
as.data.frame(table(screener$Q13))

# Per-bug correctness table: number of correct solutions, number of attempts,
# and the percentage correct, ordered by bug
summarise_correctness_by_bug <- function(task_rows) {
  task_rows %>%
    group_by(bug) %>%
    summarise(
      num_correct = sum(correct, na.rm = TRUE),
      total       = n(),
      percent     = round(num_correct / total * 100, 1)
    ) %>%
    arrange(bug)
}

# Per-bug mean self-reported cognitive load, highest first
rank_bugs_by_cognitive_load <- function(task_rows) {
  task_rows %>%
    group_by(bug) %>%
    summarise(
      avg_cognitive_load = round(mean(cognitive_load, na.rm = TRUE), 2)
    ) %>%
    arrange(desc(avg_cognitive_load))
}

# Rows from the control group only (no patch shown)
control_rows <- full_df %>% filter(condition == "control")

# Which bugs could be debugged correctly without a patch, and at what rate?
bug_summary <- summarise_correctness_by_bug(control_rows)
bug_summary

cognitive_load_ranking <- rank_bugs_by_cognitive_load(control_rows)
cognitive_load_ranking

# The same two summaries across all conditions
cognitive_load_ranking_full <- rank_bugs_by_cognitive_load(full_df)
cognitive_load_ranking_full

bug_summary_full <- summarise_correctness_by_bug(full_df)
bug_summary_full
```

# RQ1a: Just run the models from APR, What is it Good For? (ICSE 2024)
```{r}
# These are not the final models for this RQ and were not derived in a
# principled way; they only show how the models from the prior paper fare on
# this dataset.

# Correctness: logistic mixed model with random intercepts for PID and bug
correctness_model <- glmer(
  correct ~ condition + (1 | PID) + (1 | bug),
  data = full_df,
  family = binomial
)
summary(correctness_model)

# Timing: linear mixed model with condition and number of previous tasks
timing_model <- lmer(
  time_minutes ~ condition + num_prev + (1 | PID) + (1 | bug),
  data = full_df
)
summary(timing_model)

# Redo significance testing: stack the Tukey contrast matrices for condition
# and num_prev and test them together in a single glht call
k1 <- glht(timing_model, linfct = mcp(condition = "Tukey"))$linfct
k2 <- glht(timing_model, linfct = mcp(num_prev = "Tukey"))$linfct
posthoc <- glht(timing_model, linfct = rbind(k1, k2))
summary(posthoc)
model_parameters(timing_model)

# Variance explained and Anova table for the timing model
r2(timing_model)
Anova(timing_model)
```

# Check predictors for collinearity issues and merge
```{r}
# Q20 is IntelliJ IDEA IDE experience and Q21 is Java knowledge.
# Both are ordinal text answers; convert each to its 1-5 rank
# (any answer that is not one of the five labels becomes NA).
familiarity_labels <- c("Not familiar at all",
                        "Slightly familiar",
                        "Moderately familiar",
                        "Very familiar",
                        "Extremely familiar")
knowledge_labels <- c("Not knowledgeable at all",
                      "Slightly knowledgeable",
                      "Moderately knowledgeable",
                      "Very knowledgeable",
                      "Extremely knowledgeable")

exp_vars <- screener[, c("Q20", "Q21")]
exp_vars$Q20 <- as.numeric(match(screener$Q20, familiarity_labels))
exp_vars$Q21 <- as.numeric(match(screener$Q21, knowledge_labels))

# Polychoric correlation: correlation between two ordinal variables
polychoric(exp_vars)

# The two covary highly enough to be averaged into a single component
screener$java_intellij_experience <- rowMeans(exp_vars, na.rm = TRUE)

# If this number is low, years of experience and java_intellij_experience can
# be used as separate predictors without harming model collinearity
cor(screener$professional_YOE,
    screener$java_intellij_experience,
    use = "complete.obs",
    method = "spearman")

# Correspondence between professional identity and YOE
boxplot(professional_YOE ~ professional_identity,
        data = screener,
        main = "YOE by Professional Identity",
        xlab = "Professional Identity",
        ylab = "Years of Experience")

# Mean YOE within each identity category
# PhD and Masters have similar average YOE, but have different professional identities, so it may be valuable to include this as a predictor
aggregate(professional_YOE ~ professional_identity,
          data = screener,
          FUN = mean)

# Attach the screener-derived predictors to the analysis dataframe
screener_predictors <- c("PID",
                         "java_intellij_experience",
                         "professional_YOE",
                         "professional_identity")
full_df <- merge(full_df,
                 screener[, screener_predictors],
                 by = "PID",
                 all.x = TRUE)
```

# Subset dataframes for relevant analyses
```{r}
# FIXME: not necessarily required to look at source code, should find a better thing to subset on
# Rows with usable gaze data: those that have a source-code fixation count
gaze_valid_df <- full_df[!is.na(full_df$Source.Code_fixation_count), ]
# Drop P1t1 as well due to manual inspection (following best practices for ascertaining gaze data quality)
gaze_valid_df <- gaze_valid_df[
  !(gaze_valid_df$PID == "P1" & gaze_valid_df$task_no == 1),
]
gaze_valid_df <- droplevels(gaze_valid_df)

# boolean for whether they looked at the buggy method
# (no time-to-first-fixation recorded means they never looked at it)
gaze_valid_df$looked_at_buggy_method <- !is.na(gaze_valid_df$ttff_buggy_method)

# Status and response columns for survival analysis of fault localization:
# the event happened if the buggy method was first fixated before 25 minutes
gaze_valid_df$fl_status <- !is.na(gaze_valid_df$ttff_buggy_method) &
  gaze_valid_df$ttff_buggy_method < 25
gaze_valid_df$ttff_fl_for_model <- ifelse(
  is.na(gaze_valid_df$ttff_buggy_method),
  25,  # censored at 25 minutes
  pmin(gaze_valid_df$ttff_buggy_method, 25)  # in rare event they found in between warning me and me walking over
)

relevant_aoi_duration_cols <- c("Test.and.Runtime.Feedback_fixation_duration",
                                "Tests_fixation_duration",
                                "Source.Code_fixation_duration",
                                "Browser_fixation_duration")

relevant_patch_aoi_duration_cols <- c(relevant_aoi_duration_cols,
                                      "Patch_fixation_duration")

# fill 0s for NAs in relevant AOI duration columns - they spent no time looking at these
gaze_valid_df[relevant_aoi_duration_cols][
  is.na(gaze_valid_df[relevant_aoi_duration_cols])
] <- 0

relevant_aoi_prop_cols <- paste0("prop_", relevant_aoi_duration_cols)
relevant_patch_aoi_prop_cols <- paste0("prop_", relevant_patch_aoi_duration_cols)

# Total fixation duration over the relevant AOIs, then each AOI's share of it
gaze_valid_df$total_fixation_duration <-
  rowSums(gaze_valid_df[, relevant_aoi_duration_cols])
gaze_valid_df[, relevant_aoi_prop_cols] <-
  gaze_valid_df[, relevant_aoi_duration_cols] /
  gaze_valid_df$total_fixation_duration

# Tasks in which a patch was shown (correct or overfitting)
full_df_had_patch <- full_df[full_df$had_patch, ]
# relevel bc no control
full_df_had_patch <- droplevels(full_df_had_patch)

# Patch judgment is accurate when a correct patch was judged YP or an
# overfitting patch was judged N; a missing judgment stays NA
full_df_had_patch$accurate_about_patch_correctness <-
  (full_df_had_patch$condition == "correct" &
     full_df_had_patch$think_patch_correct == "YP") |
  (full_df_had_patch$condition == "overfitting" &
     full_df_had_patch$think_patch_correct == "N")
full_df_had_patch$accurate_about_patch_correctness <- ifelse(
  is.na(full_df_had_patch$think_patch_correct),
  NA,
  full_df_had_patch$accurate_about_patch_correctness
)

gaze_valid_had_patch <- full_df_had_patch[
  !is.na(full_df_had_patch$Source.Code_fixation_count),
]
# Apply the same P1t1 exclusion as for gaze_valid_df: the recording was judged
# unusable on manual inspection, so it must not enter the patch-only gaze
# analyses either (no effect if that task was in the control condition)
gaze_valid_had_patch <- gaze_valid_had_patch[
  !(gaze_valid_had_patch$PID == "P1" & gaze_valid_had_patch$task_no == 1),
]
gaze_valid_had_patch <- droplevels(gaze_valid_had_patch)
gaze_valid_had_patch[relevant_patch_aoi_duration_cols][
  is.na(gaze_valid_had_patch[relevant_patch_aoi_duration_cols])
] <- 0
# Here the total (and therefore each proportion) also includes the Patch AOI
gaze_valid_had_patch$total_fixation_duration <-
  rowSums(gaze_valid_had_patch[, relevant_patch_aoi_duration_cols])
gaze_valid_had_patch[, relevant_patch_aoi_prop_cols] <-
  gaze_valid_had_patch[, relevant_patch_aoi_duration_cols] /
  gaze_valid_had_patch$total_fixation_duration
```

# RQ1a: How do patch suggestions affect time and accuracy of debugging sessions? (Section FIXME)
```{r}
# Maximal model, as determined by the DAG, participant insights, and prior
# literature.
#
# FIXED EFFECTS:
#   - treatment condition and professional YOE: the key players, each tested
#     with a nested-model LRT below
#   - IntelliJ/Java experience
#   - self-efficacy scores
#   - number of previous tasks (looking for ordering effects)
#
# Note: my intuition is that professional YOE/professional identity are too
# related as constructs for what we're trying to measure (latent expertise) to
# include both in the same model.
#
# A condition * professional YOE interaction (hypothesis from participants:
# professionals might be less affected by wrong patches) and some other
# interaction effects would be included, but there are not really enough
# samples to do this.
#
# RANDOM EFFECTS:
#   - bug, PID
# optimizer maybe helps the convergence some because it goes for more iterations

# No clear difference between control and overfitting for correctness
table(full_df$correct, full_df$condition)

full_correctness_model <- glmer(
  correct ~ condition + professional_YOE +
    java_intellij_experience + self_efficacy + num_prev +
    (1 | PID) + (1 | bug),
  data = full_df,
  family = binomial
)

# LRT 1: does condition improve the model?
correctness_model_no_condition <- glmer(
  correct ~ professional_YOE +
    java_intellij_experience + self_efficacy + num_prev +
    (1 | PID) + (1 | bug),
  data = full_df,
  family = binomial
)
condition_LRT <- anova(full_correctness_model, correctness_model_no_condition)

# LRT 2: does professional_YOE improve the model?
correctness_model_no_YOE <- glmer(
  correct ~ condition +
    java_intellij_experience + self_efficacy + num_prev +
    (1 | PID) + (1 | bug),
  data = full_df,
  family = binomial
)
YOE_LRT <- anova(full_correctness_model, correctness_model_no_YOE)

# BH-correct the two LRT p-values for multiple comparisons
p_vals <- c(
  condition = condition_LRT[2, "Pr(>Chisq)"],
  YOE       = YOE_LRT[2, "Pr(>Chisq)"]
)
p_adjusted <- p.adjust(p_vals, method = "BH")

# Raw and adjusted p-values side by side for easy comparison
data.frame(
  predictor = names(p_vals),
  p_raw     = p_vals,
  p_adj     = p_adjusted
)

# LRT results: YOE does not improve the model
# Summarize the best model to report
summary(correctness_model_no_YOE)
tidy(correctness_model_no_YOE, exponentiate = TRUE, conf.int = TRUE)
table(full_df$condition, full_df$correct)
emmeans(correctness_model_no_YOE, ~ condition, type = "response")

# Overall results:
# Correct patch suggestions predict solution correctness
# No evidence for overfitting patches making things worse

# Summary stats
summary(full_df$time_minutes)

# Look at distribution shape
hist(full_df$time_minutes, breaks = 30)

# Gamma GLMM (log link) for time on task
full_timing_model <- glmer(time_minutes ~ condition
                           + java_intellij_experience + self_efficacy + num_prev
                           + (1 | PID) + (1 | bug),
                           data = full_df,
                           family = Gamma(link = "log"))

# Check residuals because of skewness
residuals <- resid(full_timing_model)
hist(residuals)
# skewness() may return a small table (estimate plus standard error) rather
# than a bare number; pasting the whole table would print one string per
# column, so report only the first value (the skewness estimate)
paste0("This is the skewness of the residuals: ",
       unlist(skewness(residuals))[[1]])

# not normal! survival analysis to see how treatment condition affected response time
survivor <- Surv(time = full_df$time_minutes, event = full_df$status)
# addition: do analysis with event being time + got correct

# Test for a difference between the condition-wise survival curves
survdiff(survivor ~ condition, data = full_df)

# fit KM curve
km_fit <- survfit(survivor ~ condition, data = full_df)
summary(km_fit)

cb_colors <- c("#0072B2", "#E69F00", "#000000")  # blue, orange, black

plot(km_fit,
     col = cb_colors,
     lty = 1, lwd = 2,
     xlab = "Time (minutes)",
     ylab = "Probability of failing to produce response",
     main = "Response time by condition")

# Take the legend labels from the fitted strata (named "condition=<level>") so
# each label is guaranteed to sit next to the colour of the curve it describes,
# whatever the factor level order is
km_labels <- sub("^condition=", "", names(km_fit$strata))
legend("topright",
       legend = km_labels,
       col = cb_colors[seq_along(km_labels)],
       lty = 1, lwd = 2)

# Now, CPH model to see what predictors might be responsible for the differences
cox_time_full <- coxme(survivor ~ condition + professional_YOE
                       + java_intellij_experience + self_efficacy + num_prev
                       + (1 | PID) + (1 | bug),
                       data = full_df)

# LRT 1: does condition improve the model?
cox_time_no_condition <- coxme(survivor ~ professional_YOE
                               + java_intellij_experience + self_efficacy + num_prev
                               + (1 | PID) + (1 | bug),
                               data = full_df)
# NOTE: condition_LRT / YOE_LRT / p_vals / p_adjusted reuse the names from the
# correctness analysis and overwrite those results
condition_LRT <- anova(cox_time_full, cox_time_no_condition)

# LRT 2: does professional_YOE improve the model?
cox_time_no_YOE <- coxme(survivor ~ condition
                         + java_intellij_experience + self_efficacy + num_prev
                         + (1 | PID) + (1 | bug),
                         data = full_df)
YOE_LRT <- anova(cox_time_full, cox_time_no_YOE)

# BH-correct for multiple comparisons in LRTs and present results
p_vals <- c(
  condition    = condition_LRT$`P(>|Chi|)`[2],
  YOE          = YOE_LRT$`P(>|Chi|)`[2]
)

p_adjusted <- p.adjust(p_vals, method = "BH")

# print both together for easy comparison
data.frame(
  predictor = names(p_vals),
  p_raw     = p_vals,
  p_adj     = p_adjusted
)

summary(cox_time_no_YOE)

# TODO: Are professionals more likely to edit the patch they were given? Are students more likely to overfit? (passes tests x is incorrect for a new column of “overfitting_fix”)
```

# RQ1b: How do developer self-reports measure up to behavioral outcomes?
```{r}
# Contingency tables:
#   - believed correct vs. actually correct
#   - believed patch correct vs. patch condition
#   - would submit vs. actually correct
correct_tab <- table(full_df$correct, full_df$think_correct)
patch_correct_tab <- table(full_df_had_patch$condition,
                           full_df_had_patch$think_patch_correct)
submit_correct_tab <- table(full_df$would_submit, full_df$correct)

as.data.frame.matrix(correct_tab)
as.data.frame.matrix(patch_correct_tab)
as.data.frame.matrix(submit_correct_tab)

# Fix the seed so the simulated p-values are deterministic.
# The order of the tests below matters: each one consumes random numbers.
set.seed(20260622)
# The tables are sparse, so p-values are simulated
test_patch <- chisq.test(patch_correct_tab,
                         simulate.p.value = TRUE, B = 10000)
test_correct <- chisq.test(correct_tab,
                           simulate.p.value = TRUE, B = 10000)
test_submit <- chisq.test(submit_correct_tab,
                          simulate.p.value = TRUE, B = 10000)

test_patch
test_correct
test_submit

# Standardized residuals (anything above 2 or below -2 is notable)
test_patch$stdres
test_correct$stdres
test_submit$stdres

# Notably, participants are good at detecting if patches are correct and if their own fixes are, too

# Are professionals better at determining the correctness of their fixes and
# of the patch?
professional_correct_tab <- table(full_df$professional_identity,
                                  full_df$accurate_about_correctness)
professional_patch_correct_tab <- table(
  full_df_had_patch$professional_identity,
  full_df_had_patch$accurate_about_patch_correctness
)
professional_correct_tab
professional_patch_correct_tab

test_prof_accuracy <- chisq.test(professional_correct_tab,
                                 simulate.p.value = TRUE, B = 10000)
test_prof_patch_accuracy <- chisq.test(professional_patch_correct_tab,
                                       simulate.p.value = TRUE, B = 10000)

test_prof_accuracy
test_prof_patch_accuracy

test_prof_accuracy$stdres
test_prof_patch_accuracy$stdres

# Also test YOE against judgment accuracy (Wilcoxon rank-sum test)
wilcox.test(professional_YOE ~ accurate_about_patch_correctness,
            data = full_df_had_patch)
wilcox.test(professional_YOE ~ accurate_about_correctness,
            data = full_df)

# Median YOE by accuracy of the patch judgment
aggregate(professional_YOE ~ accurate_about_patch_correctness,
          data = full_df_had_patch,
          FUN = median)

# Verdict: we do not have significant evidence to suggest this is the case.
# However, trend for accurate about patch correctness
# 127 total judgments, 80 for patch, 125 submit

# How well do the correctness and submission judgments align? They will be
# highly correlated, but how many "no, I won't submit" answers are there for
# fixes the participant thinks are correct?
table(full_df$think_correct, full_df$would_submit)

# 20 were NQ or N...interesting

```

# RQ4: How do patch suggestions influence cognitive load during debugging? (Section FIXME)
```{r}
# High level plan: model the subjective NASA-TLX reports first, then see
# whether eye tracking metrics agree with the self-reports

# Self-reported cognitive load as the outcome
cognitive_load_model <- lmer(
  cognitive_load ~ condition + professional_YOE +
    java_intellij_experience + num_prev +
    (1 | PID) + (1 | bug),
  data = full_df
)
summary(cognitive_load_model)

# Gaze metric related to cognitive load: average fixation duration (AFD)
hist(gaze_valid_df$avg_fixation_duration)
# The histogram looks mostly normal, so a regular lmer is used
fix_duration_model <- lmer(
  avg_fixation_duration ~ condition + (1 | PID) + (1 | bug),
  data = gaze_valid_df
)
summary(fix_duration_model)

# AFD is likely also affected by when participants first looked at the patch
# (i.e. how long they spent debugging unassisted), so that is analysed
# separately on the tasks that had a patch.
# condition not expected to matter here
fix_duration_patch_model <- lmer(
  avg_fixation_duration ~ condition + ttff_patch + (1 | PID) + (1 | bug),
  data = gaze_valid_had_patch
)
summary(fix_duration_patch_model)
```

# Testing hypotheses for RQ3: What debugging strategies and behaviors distinguish successful from unsuccessful debugging attempts? pt. 1 (Section FIXME)
# Hypotheses derived from interview data
```{r}
# Why PID cannot be a random effect in the patch-only analyses: some
# participants contribute only one sample, so it is not estimable
table(gaze_valid_had_patch$condition, gaze_valid_had_patch$PID)

# Keep only the tasks in which the patch was fixated at least once
ttff_patch_valid_df <- gaze_valid_had_patch[
  !is.na(gaze_valid_had_patch$ttff_patch),
]

# Hypothesis: people who look at the patch early on may be anchored to the patch's solution and may finish the task faster/more correctly (dependent on condition)
ttff_patch_model <- glmer(
  correct ~ ttff_patch + (1 | bug),
  data = ttff_patch_valid_df,
  family = binomial
)
summary(ttff_patch_model)

# Nested comparison: does ttff_patch add anything on top of condition?
m0 <- glmer(
  correct ~ condition + (1 | bug),
  data = ttff_patch_valid_df,
  family = binomial
)
summary(m0)
m1 <- glmer(
  correct ~ condition + ttff_patch + (1 | bug),
  data = ttff_patch_valid_df,
  family = binomial
)
LRT1 <- anova(m0, m1)
LRT1
# Not significant and ttff_patch does not improve the model above condition -- but overfitting patches predict incorrect fixes with correct patches as the comparison level

# TODO: maybe include time-dependent covariate, convert df into long format to test its effects on response rate
# ttff_patch_cox = coxme(Surv(time_minutes, status) ~ condition + ttff_patch + (1 | bug), data = gaze_valid_had_patch)
# summary(ttff_patch_cox)

# Hypothesis: Developers report triangulating multiple sources of information (tests, test output, code, patch, web-search) to debug, and prior work supports the idea that higher attention switching behavior correlates with effort and the accuracy of proof correction/problem-solving. Developers who exhibit higher attention switching behavior between AOIs may be more performant.
attention_switch_model <- glmer(
  correct ~ attention_switching_rate + (1 | PID) + (1 | bug),
  data = gaze_valid_df,
  family = binomial
)
summary(attention_switch_model)

# Same predictor against time-to-response
attention_switch_cox <- coxme(
  Surv(time_minutes, status) ~ attention_switching_rate + (1 | PID) + (1 | bug),
  data = gaze_valid_df
)
summary(attention_switch_cox)
```
# Testing hypotheses for RQ3: What debugging strategies and behaviors distinguish successful from unsuccessful debugging attempts? pt. 2 (Section FIXME)
```{r}
# Correctness by condition among the tasks with valid gaze data
table(gaze_valid_df$condition, gaze_valid_df$correct)
# Large separation with correctness and condition. In other words, almost all of the variance in correctness is explained by condition (explain table)

# Correctness vs. share of fixation time on the patch AOI (we are interested
# in the interaction effect).
# The random effect of bug had to be removed because it was not estimable in
# the interaction term, and the models should stay comparable.
correct_patch_model <- glm(
  correct ~ prop_Patch_fixation_duration,
  data = gaze_valid_had_patch,
  family = binomial
)
summary(correct_patch_model)

# Nested comparison: does the patch proportion add anything beyond condition?
m0 <- glm(
  correct ~ condition,
  data = gaze_valid_had_patch,
  family = binomial
)
m1 <- glm(
  correct ~ condition + prop_Patch_fixation_duration,
  data = gaze_valid_had_patch,
  family = binomial
)
LRT1 <- anova(m0, m1, test = "Chisq")
LRT1
# does not improve

# Because of the strong separation between fix correctness and treatment
# condition in these data, just look at the numbers for a trend (this is a
# limitation)
gaze_valid_had_patch %>%
  group_by(condition, correct) %>%
  summarise(
    n = n(),
    mean_prop = mean(prop_Patch_fixation_duration),
    min_prop = min(prop_Patch_fixation_duration),
    max_prop = max(prop_Patch_fixation_duration)
  )
library(ggplot2)
# One panel per condition: patch fixation share against correctness
gaze_valid_had_patch %>%
  ggplot(aes(x = prop_Patch_fixation_duration,
             y = correct,
             color = condition)) +
  geom_point() +
  facet_wrap(~condition)

# Timing: non-patch AOIs
# For every column in relevant_aoi_prop_cols, fit a mixed-effects Cox model of
# time to response on that AOI proportion, controlling for condition, with
# random intercepts for participant (PID) and bug. The raw p-value of each AOI
# term is collected and then corrected for multiple comparisons.

# One named slot per AOI, allocated up front instead of growing the vector
# inside the loop.
p_values_time <- setNames(
  rep(NA_real_, length(relevant_aoi_prop_cols)),
  relevant_aoi_prop_cols
)

for (col in relevant_aoi_prop_cols) {
  formula <- as.formula(paste("Surv(time_minutes, status) ~", col, "+ condition + (1|PID) + (1|bug)"))
  time_model <- coxme(formula, data = gaze_valid_df)

  # A bare `summary(time_model)` prints nothing inside a for loop, so the
  # summary is computed once and the coefficient table is printed explicitly.
  # Use print(time_model_summary) here if the full summary is wanted.
  time_model_summary <- summary(time_model)
  coef_table <- time_model_summary$coefficients
  print(coef_table)

  # Keep the raw p-value of the AOI term for the correction below.
  p_values_time[col] <- coef_table[col, "p"]
}

# Benjamini-Hochberg adjustment across all AOI models.
p_adjusted_time <- p.adjust(p_values_time, method = "BH")

# Raw and adjusted p-values, one row per AOI column.
data.frame(
  aoi = relevant_aoi_prop_cols,
  p_raw = p_values_time,
  p_adjusted = p_adjusted_time
)

# Mixed-effects Cox model of time to response on the proportion of fixation
# duration spent on test/runtime feedback, controlling for condition, with
# random intercepts for participant (PID) and bug.
survival_feedback_dur <- coxme(
  Surv(time = time_minutes, event = status) ~
    prop_Test.and.Runtime.Feedback_fixation_duration + condition + (1|PID) + (1|bug),
  data = gaze_valid_df
)
# The summary is stored so the coefficient can be reused below.
survival_feedback_summary <- summary(survival_feedback_dur)
survival_feedback_summary

# spending more of your time looking at test/runtime feedback is associated with a significantly higher hazard of responding (controlling for condition)
with(
  gaze_valid_df,
  cor(prop_Test.and.Runtime.Feedback_fixation_duration, time_minutes, use = "complete.obs")
)
summary(aov(prop_Test.and.Runtime.Feedback_fixation_duration ~ condition, data = gaze_valid_df))
# not correlated strongly with time as an outcome and condition does not predict it, so it independently is predictive of response hazard

# Find IQR-scaled HR, since the ratio is meaningless for a proportion (increase in 1 is not possible)
IQR_val <- IQR(gaze_valid_df$prop_Test.and.Runtime.Feedback_fixation_duration, na.rm = TRUE)

# Read the log-hazard coefficient from the fitted model rather than pasting a
# number copied from printed output, so the scaled HR cannot go stale when the
# data or model change.
feedback_coef <- survival_feedback_summary$coefficients[
  "prop_Test.and.Runtime.Feedback_fixation_duration", "coef"
]
exp(feedback_coef * IQR_val)

# Mixed-effects Cox model of time to response on the proportion of fixation
# duration spent on the patch, controlling for condition, with a random
# intercept for bug (patch-condition rows only).
# Finding: spending more of your time looking at the patch is associated with
# a significantly higher hazard of responding (controlling for condition).
survival_patch_dur <- coxme(
  Surv(time = time_minutes, event = status) ~
    prop_Patch_fixation_duration + condition + (1|bug),
  data = gaze_valid_had_patch
)
summary(survival_patch_dur)

# Sanity checks: correlation of the predictor with task time, and whether
# condition predicts the predictor.
with(
  gaze_valid_had_patch,
  cor(prop_Patch_fixation_duration, time_minutes, use = "complete.obs")
)
summary(aov(prop_Patch_fixation_duration ~ condition, data = gaze_valid_had_patch))
# Condition does correlate with the proportion of fixation duration on the
# patch, which makes this result much less interpretable / less trustworthy.
```

# RQ2: How do patch suggestions influence developers' debugging strategies and behaviors? (Section FIXME) pt. 1
```{r}
# Hypothesis: developers overwhelmingly report that having access to a patch
# speeds up fault localization, or makes it possible at all. So: when
# developers have access to a patch, do they find the location of the bug
# faster?
# Note: ttff_fl_for_model should be 25 where the raw value is NA, since those
# participants "survived" (never found the bug).
fl_cox_model <- coxme(
  Surv(ttff_fl_for_model, fl_status) ~ condition + (1 | PID) + (1 | bug),
  data = gaze_valid_df
)
summary(fl_cox_model)
# Result: yes, people found the bug substantially faster when they had a patch.

# Check whether the patch also helped people find the bug for chart12, whose
# patch had the wrong fault localization. Rows are restricted to that bug and
# unused factor levels are dropped.
chart12_gaze_valid <- droplevels(
  gaze_valid_df[gaze_valid_df$bug == "chart12", ]
)
nrow(chart12_gaze_valid)
# 26 items

# Rows per condition, then how many of them ever fixated on the buggy method
# (a non-missing ttff_buggy_method).
table(chart12_gaze_valid$condition)
with(
  chart12_gaze_valid,
  tapply(ttff_buggy_method, condition, function(v) sum(!is.na(v)))
)

# Mean and median time to first fixation on the buggy method per condition,
# ignoring missing values.
with(chart12_gaze_valid, tapply(ttff_buggy_method, condition, mean, na.rm = TRUE))
with(chart12_gaze_valid, tapply(ttff_buggy_method, condition, median, na.rm = TRUE))
# The pattern is directionally consistent with the overall finding, but n is
# small: take it with a grain of salt; this needs more study.


# Hypothesis: developers also report that having access to a patch narrows
# their focus. So: with a patch, do they look at a narrower slice of the
# codebase, or show a "pointier" distribution of attention?

# Poisson mixed model for the number of files looked at, with random
# intercepts for participant and bug. The offset fixes the coefficient of
# log(time_minutes) at 1, i.e. it assumes a 1:1 relationship between time
# spent on the task and the number of files looked at.
num_files_m <- glmer(
  num_files_looked_at ~ condition + offset(log(time_minutes)) + (1|PID) + (1|bug),
  data = gaze_valid_df,
  family = poisson
)
summary(num_files_m)

# Linear mixed model for the entropy of fixation duration, same random
# effects structure.
entropy_m <- lmer(
  fixation_duration_entropy ~ condition + (1|PID) + (1|bug),
  data = gaze_valid_df
)
summary(entropy_m)
```

# RQ2: How do patch suggestions influence developers' debugging strategies and behaviors? (Section FIXME) pt. 2
```{r}
# Goal: see whether the distribution of attention over the AOIs we care about
# changed with the addition of a patch.
# Reshape to long format: one row per (observation, AOI), with the AOI
# proportion columns stacked into `prop_fixation`.
gaze_valid_long <- gaze_valid_df %>%
  dplyr::select(
    PID, bug, condition, had_patch,
    all_of(relevant_aoi_prop_cols)
  ) %>%
  pivot_longer(
    cols = all_of(relevant_aoi_prop_cols),
    names_to = "AOI",
    values_to = "prop_fixation"
  ) %>%
  mutate(
    # Column name -> readable AOI label: drop the leading "prop_" and the
    # trailing "_fixation_duration", turn dots into spaces
    # ("Test.and.Runtime.Feedback" --> "Test and Runtime Feedback"),
    # then convert to a factor.
    AOI = AOI %>%
      str_remove("^prop_") %>%
      str_remove("_fixation_duration$") %>%
      str_replace_all("\\.", " ") %>%
      factor(),
    # Identifier columns as factors.
    PID = factor(PID),
    bug = factor(bug),
    condition = factor(condition)
  )

gaze_valid_long

# Stacked bar chart: mean proportion of fixation duration per AOI, one bar
# per value of had_patch.
gaze_valid_long %>%
  group_by(had_patch, AOI) %>%
  summarise(mean_prop = mean(prop_fixation, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = had_patch, y = mean_prop, fill = AOI)) +
  geom_col(position = "stack") +
  labs(
    x = "Condition",
    y = "Mean proportion of fixation duration",
    fill = "AOI"
  ) +
  theme_minimal()

# Aligned-rank-transform models of fixation proportion on the patch factor
# crossed with AOI: once with had_patch, once with condition.
# NOTE(review): neither formula has a grouping/error term for PID or bug, so
# rows are treated as independent observations -- confirm this is intended.
m <- art(
  prop_fixation ~ had_patch * AOI,
  data = gaze_valid_long
)
anova(m)

m2 <- art(
  prop_fixation ~ condition * AOI,
  data = gaze_valid_long
)
anova(m2)

# Result: we observe no difference in the distribution of fixation duration
# over non-patch AOIs between conditions.

# Do people spend more time looking at the patch in either condition?
# Linear mixed model of patch fixation proportion on condition with a random
# intercept for bug (patch-condition rows only).
patch_dur_model <- lmer(
  prop_Patch_fixation_duration ~ condition + (1|bug),
  data = gaze_valid_had_patch
)
summary(patch_dur_model)

# Estimated marginal means per condition and the standardized effect size,
# using the model's residual SD and residual degrees of freedom.
emm <- emmeans(patch_dur_model, ~ condition)
emm
eff_size(
  emm,
  sigma = sigma(patch_dur_model),
  edf = df.residual(patch_dur_model)
)

# Raw per-condition means for reference.
aggregate(
  prop_Patch_fixation_duration ~ condition,
  data = gaze_valid_had_patch,
  FUN = mean
)
# Difference is 3.1% of fixation time, a relative reduction of 40%.

# Distribution of time to first fixation on the patch (raw values).
hist(gaze_valid_had_patch$ttff_patch, breaks = 30)

# Same quantity as a percentage of each row's task time.
gaze_valid_had_patch$prop_ttff_patch_over_time <- with(
  gaze_valid_had_patch,
  (ttff_patch / time_minutes) * 100
)
hist(gaze_valid_had_patch$prop_ttff_patch_over_time, breaks = 15)

# Shared axis definitions for the exported figure: 5-point bins over 0-100,
# tick labels every 20 points.
pct_breaks <- seq(0, 100, by = 5)
pct_ticks <- seq(0, 100, by = 20)

# Bin counts only (no plot); used to leave headroom on the y axis.
h <- hist(gaze_valid_had_patch$prop_ttff_patch_over_time, breaks = pct_breaks, plot = FALSE)

# Write the histogram to a PDF in the working directory.
pdf("fixation_histogram.pdf", width = 7, height = 4)
hist(
  gaze_valid_had_patch$prop_ttff_patch_over_time,
  breaks = pct_breaks,
  col = "goldenrod",
  border = "black",
  main = "Timing of First Fixation on Patch",
  xlab = "Percentage of Task Time Elapsed Before First Fixation on Patch",
  ylab = "Frequency",
  ylim = c(0, max(h$counts) + 5),
  # Suppress the default x axis; a percent-labelled one is drawn below.
  xaxt = "n"
)
axis(1, at = pct_ticks, labels = paste0(pct_ticks, "%"))
dev.off()

# Time to first fixation on the buggy method as a percentage of each row's
# task time, followed by its five-number summary.
gaze_valid_had_patch$ttff_buggy_method_prop_time <- with(
  gaze_valid_had_patch,
  (ttff_buggy_method / time_minutes) * 100
)
summary(gaze_valid_had_patch$ttff_buggy_method_prop_time)

# NOTE(review): prints `screener`, which is not defined in this chunk --
# confirm this is intentional.
screener
```