# includes: require(Peptides) require(venndir) require(tidyverse) require(cowplot) # required functions for reading crosslinks from different formats replace_leucine_isoleucine <- function(peptide) { return(gsub("[IL]", "J", peptide)) } read_from_sqlite <- function(pdResultFileName, tableName) { require(RSQLite) con <- dbConnect(RSQLite::SQLite(), pdResultFileName) read_data = dbReadTable(con, tableName) dbDisconnect(con) return(read_data) } annika_read_crosslinks <- function(pdResultFileName) { require(dplyr) read_raw = read_from_sqlite(pdResultFileName, "Crosslinks") return_df = read_raw %>% dplyr::mutate(sequence_a_ieql = gsub("[^A-Z]", "", replace_leucine_isoleucine(SequenceA))) %>% dplyr::mutate(sequence_b_ieql = gsub("[^A-Z]", "", replace_leucine_isoleucine(SequenceB))) return(return_df) } countCharOccurrences <- function(char, s) { s2 <- gsub(char, "", s) return(nchar(s) - nchar(s2)) } replace_merox_letters <- function(peptide) { #deletes non characters and replaces carbamidomethyl and oxidation return(gsub("[^A-Z]", "", gsub("m", "M", gsub("B", "C", peptide)))) } calculate_merox_peptide_mass <- function(peptide) { require(Peptides) num_B = countCharOccurrences("B", peptide) num_m = countCharOccurrences("m", peptide) return(Peptides::mw(peptide, T) + num_B * 160.03064814 + num_m * 147.03539917) } merox_read_csms <- function(zipFileName, scoreCutOff = 0, keepDeadEnds = FALSE) { require(readr) require(dplyr) unzipped = read_delim(unz(zipFileName, "Result.csv"), delim = ";", col_names = c("score", "mz", "charge", "mh_p", "calculated", "deviation_ppm", "pep_a_raw", "protein_a", "prot_a_start", "prot_a_end", "pep_b_raw", "protein_b", "prot_b_start", "prot_b_end", "offset_scan_fname", "some_boolean_1", "some_num_1", "some_num_2", "some_num_3", "some_na_1", "site_a", "site_b", "combined_info", "spec_uuid", "some_num_4", "some_na_2", "some_num_5", "pep_a_score", "pep_b_score", "uncorrected_score", "result_uuid", "some_num_6", "score_correction_multiplier" )) return_df = unzipped %>% select(-starts_with("some_")) %>% filter(score > scoreCutOff) %>% dplyr::mutate(sequence_a = replace_merox_letters(pep_a_raw)) %>% dplyr::mutate(sequence_b = replace_merox_letters(pep_b_raw)) %>% dplyr::mutate(sequence_a_ieql = replace_leucine_isoleucine(sequence_a)) %>% dplyr::mutate(sequence_b_ieql = replace_leucine_isoleucine(sequence_b)) %>% dplyr::mutate(mass_a = calculate_merox_peptide_mass(pep_a_raw)) %>% dplyr::mutate(mass_b = calculate_merox_peptide_mass(pep_b_raw)) %>% dplyr::mutate(peps_equal = sequence_a == sequence_b) %>% tidyr::extract(offset_scan_fname, c("offset", "scan", "fname"), "(?:\\(-([0-9])\\) )?([0-9]+)~([0-9a-zA-Z_\\-\\. ]+)", remove = F, convert = T) %>% tidyr::separate(protein_a, c("start_a", "prot_a", "description_a"), "\\|" ) %>% tidyr::separate(protein_b, c("start_b", "prot_b", "description_b"), "\\|" ) %>% dplyr::mutate(position_in_a = as.numeric(gsub("\\D", "", site_a))) %>% dplyr::mutate(position_in_b = as.numeric(gsub("\\D", "", site_b))) %>% dplyr::mutate(pos_in_prot_a = prot_a_start + position_in_a - 1) %>% dplyr::mutate(pos_in_prot_b = prot_b_start + position_in_b - 1) if (!keepDeadEnds) { return_df = return_df %>% dplyr::filter(pos_in_prot_a >= 0 & pos_in_prot_b >= 0) } return(return_df) } merox_read_crosslinks <- function(zipFileName, scoreCutoff = 0, keepDeadEnds = FALSE) { require(dplyr) csms = merox_read_csms(zipFileName, scoreCutoff, keepDeadEnds) return( csms %>% group_by(prot_a, pos_in_prot_a, prot_b, pos_in_prot_b) %>% top_n(1, wt = score) %>% distinct(score, prot_a, prot_b, pos_in_prot_a, pos_in_prot_b, .keep_all = T) ) } xlinkx_read_crosslinks <- function(pdResultFileName) { require(dplyr) read_raw = read_from_sqlite(pdResultFileName, "Crosslinks") return_df = read_raw %>% dplyr::mutate(sequence_a_ieql = gsub("[^A-Z]", "", replace_leucine_isoleucine(SequenceA))) %>% dplyr::mutate(sequence_b_ieql = gsub("[^A-Z]", "", replace_leucine_isoleucine(SequenceB))) return(return_df) } read_peptide_groups <- function(peptideGroupsFileName) { read_data = readr::read_csv(peptideGroupsFileName) read_data$Peptide = replace_leucine_isoleucine(read_data$Peptide) read_data = dplyr::filter(read_data, !(Peptide == "VKYVTEGMR" & Group == 10)) return(read_data) } groups_equal <- function(pepA, pepB, groupA, groupB) { if (is.na(groupA) | is.na(groupB)) { return(F) } if (groupA == groupB) { return(T) } if (pepA == "VKYVTEGMR") { return(is.element(groupB, c(2,10))) } if (pepB == "VKYVTEGMR") { return(is.element(groupA, c(2,10))) } return(F) } plink_read_crosslinks <- function(filename) { returnDf = read_csv(filename) %>% filter(grepl("-", Peptide)) %>% mutate(Peptide = gsub("\\(\\d+\\)", "", Peptide)) %>% separate(Peptide, c("sequence_a", "sequence_b"), sep="-") %>% mutate(sequence_a_ieql = replace_leucine_isoleucine(sequence_a)) %>% mutate(sequence_b_ieql = replace_leucine_isoleucine(sequence_b)) return(returnDf) } # venndir split_tibble <- function(tibble, column = 'col') { tibble %>% split(., .[,column]) %>% lapply(., function(x) x[,setdiff(names(x),column)]) } plink_1p_DSBU_raw = plink_read_crosslinks("plink/DSBU/1perFDR/reports/cas9_crapome_2021.02.08.filtered_cross-linked_peptides.csv" ) plink_1p_DSSO_raw = plink_read_crosslinks("plink/DSSO/pLink2_DSSO_1per_peplib/reports/cas9_crapome_2021.02.08.filtered_cross-linked_peptides.csv" ) plink_5p_DSBU_raw = plink_read_crosslinks("plink/DSBU/5perFDR/reports/cas9_crapome_2021.02.08.filtered_cross-linked_peptides.csv" ) plink_5p_DSSO_raw = plink_read_crosslinks("plink/DSSO/pLink2_DSSO_5per_peplib/reports/cas9_crapome_2021.02.08.filtered_cross-linked_peptides.csv" ) plink_crosslinks_raw = bind_rows( DSBU = bind_rows( "1" = plink_1p_DSBU_raw, "5" = plink_5p_DSBU_raw, .id = "FDR" ), DSSO = bind_rows( "1" = plink_1p_DSSO_raw, "5" = plink_5p_DSSO_raw, .id = "FDR" ), .id = "linker" ) annika_DSSO = annika_read_crosslinks("annika/XLpeplib_Beveridge_QEx-HFX_DSSO_stHCD_newCsmOrdering.pdResult") annika_DSBU = annika_read_crosslinks("annika/XLpeplib_Beveridge_QEx-HFX_DSBU_stHCD_newCsmOrdering.pdResult") annika_DSSO_1pFDR = annika_DSSO %>% filter(Confidence == 3) annika_DSSO_5pFDR = annika_DSSO %>% filter(Confidence >= 2) annika_DSBU_1pFDR = annika_DSBU %>% filter(Confidence == 3) annika_DSBU_5pFDR = annika_DSBU %>% filter(Confidence >= 2) rm(list=ls(pattern = "_[A-Z]{4}$")) annika_crosslinks_raw = bind_rows( list( DSBU = bind_rows(list("1" = annika_DSBU_1pFDR, "5" = annika_DSBU_5pFDR), .id = "FDR"), DSSO = bind_rows(list("1" = annika_DSSO_1pFDR, "5" = annika_DSSO_5pFDR), .id = "FDR")), .id = "linker" ) merox_DSBU_1pFDR = merox_read_crosslinks("merox/PXD014337_XLpeplib_Beveridge_Lumos_DSBU_riseup_1pFDR_onlyK.zhrm") %>% filter(score>=ifelse(prot_a == prot_b, 60.6, 79.1)) merox_DSBU_5pFDR = merox_read_crosslinks("merox/PXD014337_XLpeplib_Beveridge_Lumos_DSBU_riseup_5pFDR_onlyK.zhrm") %>% filter(score>=ifelse(prot_a == prot_b, 26.3, 79.1)) merox_DSSO_1pFDR = merox_read_crosslinks("merox/PXD014337_XLpeplib_Beveridge_Lumos_DSSO_riseup_1pFDR_onlyK.zhrm") %>% filter(score>=ifelse(prot_a == prot_b, 32.5, 32.2)) merox_DSSO_5pFDR = merox_read_crosslinks("merox/PXD014337_XLpeplib_Beveridge_Lumos_DSSO_riseup_5pFDR_onlyK.zhrm") %>% filter(score>=ifelse(prot_a == prot_b, 19.0, 30.5)) merox_crosslinks_raw = bind_rows( list( DSBU = bind_rows(list("1" = merox_DSBU_1pFDR, "5" = merox_DSBU_5pFDR), .id = "FDR"), DSSO = bind_rows(list("1" = merox_DSSO_1pFDR, "5" = merox_DSSO_5pFDR), .id = "FDR")), .id = "linker" ) xlinkx_DSBU_1pFDR_pd24 = xlinkx_read_crosslinks("xlinkx/2.4/XLpeplib_Beveridge_QEx-HFX_DSBU_stHCD_4scoreDiff_1pFDR_PD24.pdResult") xlinkx_DSBU_5pFDR_pd24 = xlinkx_read_crosslinks("xlinkx/2.4/XLpeplib_Beveridge_QEx-HFX_DSBU_stHCD_4scoreDiff_5pFDR_PD24.pdResult") xlinkx_DSSO_1pFDR_pd24 = xlinkx_read_crosslinks("xlinkx/2.4/XLpeplib_Beveridge_QEx-HFX_DSSO_stHCD_4scoreDiff_1pFDR_PD24.pdResult") xlinkx_DSSO_5pFDR_pd24 = xlinkx_read_crosslinks("xlinkx/2.4/XLpeplib_Beveridge_QEx-HFX_DSSO_stHCD_4scoreDiff_5pFDR_PD24.pdResult") xlinkx_DSBU_1pFDR_pd25 = xlinkx_read_crosslinks("xlinkx/2.5/XLpeplib_Beveridge_QEx-HFX_DSBU_stHCD_4scoreDiff_1pFDR.pdResult")%>% select(-Tags) xlinkx_DSBU_5pFDR_pd25 = xlinkx_read_crosslinks("xlinkx/2.5/XLpeplib_Beveridge_QEx-HFX_DSBU_stHCD_4scoreDiff_5pFDR.pdResult")%>% select(-Tags) xlinkx_DSSO_1pFDR_pd25 = xlinkx_read_crosslinks("xlinkx/2.5/XLpeplib_Beveridge_QEx-HFX_DSSO_stHCD_4scoreDiff_1pFDR.pdResult")%>% select(-Tags) xlinkx_DSSO_5pFDR_pd25 = xlinkx_read_crosslinks("xlinkx/2.5/XLpeplib_Beveridge_QEx-HFX_DSSO_stHCD_4scoreDiff_5pFDR.pdResult")%>% select(-Tags) xlinkx_crosslinks_raw_24and25 = bind_rows(list("PD2.4" = bind_rows( list( DSBU = bind_rows(list("1" = xlinkx_DSBU_1pFDR_pd24, "5" = xlinkx_DSBU_5pFDR_pd24), .id = "FDR"), DSSO = bind_rows(list("1" = xlinkx_DSSO_1pFDR_pd24, "5" = xlinkx_DSSO_5pFDR_pd24), .id = "FDR")), .id = "linker" ), "PD2.5" = bind_rows( list( DSBU = bind_rows(list("1" = xlinkx_DSBU_1pFDR_pd25, "5" = xlinkx_DSBU_5pFDR_pd25), .id = "FDR"), DSSO = bind_rows(list("1" = xlinkx_DSSO_1pFDR_pd25, "5" = xlinkx_DSSO_5pFDR_pd25), .id = "FDR")), .id = "linker" )), .id = "PD") rm(list=ls(pattern="_\\dpFDR_pd2[45]$")) peps_with_groups = read_peptide_groups("resources/peptide_groups.csv") %>% filter(Peptide != "VKYVTEGMR" | Group != 10) annika_crosslinks = annika_crosslinks_raw %>% filter(Decoy == 0) %>% left_join(peps_with_groups, by = c("sequence_a_ieql" = "Peptide")) %>% rename(group_a = Group) %>% left_join(peps_with_groups, by = c("sequence_b_ieql" = "Peptide")) %>% rename(group_b = Group) %>% rowwise() %>% mutate(groups_eq = groups_equal(sequence_a_ieql, sequence_b_ieql, group_a, group_b)) %>% rename(Score = BestCSMScore) %>% ungroup merox_crosslinks = merox_crosslinks_raw %>% filter(!grepl("^DEC_", start_a) & !grepl("^DEC_", start_b)) %>% # remove decoys left_join(peps_with_groups, by = c("sequence_a_ieql" = "Peptide")) %>% rename(group_a = Group) %>% left_join(peps_with_groups, by = c("sequence_b_ieql" = "Peptide")) %>% rename(group_b = Group) %>% rowwise() %>% mutate(groups_eq = groups_equal(sequence_a_ieql, sequence_b_ieql, group_a, group_b)) %>% rename(Score = score) %>% filter(nchar(sequence_a) > 0 & nchar(sequence_b) > 0) %>% ungroup xlinkx_crosslinks_24and25 = xlinkx_crosslinks_raw_24and25 %>% left_join(peps_with_groups, by = c("sequence_a_ieql" = "Peptide")) %>% rename(group_a = Group) %>% left_join(peps_with_groups, by = c("sequence_b_ieql" = "Peptide")) %>% rename(group_b = Group) %>% rowwise() %>% mutate(groups_eq = groups_equal(sequence_a_ieql, sequence_b_ieql, group_a, group_b)) %>% rename(Score = MaxXlinkXScore) %>% ungroup plink_crosslinks = plink_crosslinks_raw %>% left_join(peps_with_groups, by = c("sequence_a_ieql" = "Peptide")) %>% rename(group_a = Group) %>% left_join(peps_with_groups, by = c("sequence_b_ieql" = "Peptide")) %>% rename(group_b = Group) %>% rowwise() %>% mutate(groups_eq = groups_equal(sequence_a_ieql, sequence_b_ieql, group_a, group_b)) %>% mutate(Score = 0) %>% ungroup # BAR PLOT all_crosslinks_required_fields = bind_rows(list( "MS Annika" = annika_crosslinks %>% select(linker, FDR, Score, groups_eq), "MeroX" = merox_crosslinks %>% select(linker, FDR, Score, groups_eq), "XlinkX 2.4" = xlinkx_crosslinks_24and25 %>% filter(PD == "PD2.4") %>% select(linker, FDR, Score, groups_eq), "XlinkX 2.5" = xlinkx_crosslinks_24and25 %>% filter(PD == "PD2.5") %>% select(linker, FDR, Score, groups_eq), "pLink" = plink_crosslinks %>% select(linker, FDR, Score, groups_eq) ), .id = "Tool") %>% group_by(Tool, linker, FDR) %>% mutate(relativeScore = Score/max(Score)) counts_all_req_fields = all_crosslinks_required_fields %>% dplyr::count(Tool, linker, FDR, groups_eq) %>% arrange(desc(groups_eq)) %>% group_by(Tool, linker, FDR) %>% mutate(cumsum = cumsum(n), total = sum(n)) %>% mutate(true_fdr = n / total) barplot_data = counts_all_req_fields %>% ungroup() %>% # this adds an empty row so plink also gets a label. bind_rows(data.frame(Tool = "pLink", linker = "DSBU", FDR = "1", groups_eq = F, n = 0, cumsum =0, total = 55, true_fdr = 0 ) ) %>% mutate(true_fdr_formatted = paste0(round(true_fdr*100, 1), "%")) %>% mutate(FDR_char = paste0(FDR, "%")) %>% # mutate(Tool = factor(Tool, levels = c("MS Annika", "MeroX", "pLink", "XlinkX 2.4", "XlinkX 2.5"))) %>% mutate(tool_groups = paste(Tool, groups_eq)) # colourblind friendly color pallete: cb <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#CC79A7") red = "#D55E00" plot_for_legend_extract = barplot_data %>% filter(groups_eq & linker == "DSSO" & FDR == 1) %>% bind_rows(barplot_data %>% filter(!groups_eq) %>% head(1)) %>% rowwise() %>% mutate(tool_groups = paste0(ifelse(groups_eq, Tool, "false positive XLs"), " ", collapse = "")) %>% mutate(tool_groups = factor(tool_groups, levels = c("MS Annika ", "MeroX ", "pLink ", "XlinkX 2.4 ", "XlinkX 2.5 ", "false positive XLs "))) %>% ggplot(aes(Tool, n , fill = tool_groups)) + geom_col() + scale_fill_manual(values = c(cb[4], cb[2], cb[3], cb[6], cb[5], red)) + theme(legend.position = "bottom", legend.title = element_blank()) + guides(fill = guide_legend(nrow = 1)) + theme(legend.text = element_text(size = 12, face = "bold")) legend = cowplot::get_legend(plot_for_legend_extract) actual_plot = barplot_data %>% mutate(Tool = factor(Tool, levels = c("MS Annika", "MeroX", "pLink", "XlinkX 2.4", "XlinkX 2.5" ))) %>% ggplot(aes(Tool, n, fill = tool_groups)) + geom_col(color = "black") + geom_label(data = subset(barplot_data, !groups_eq), aes(label = true_fdr_formatted, y = total, vjust = -.2), fontface = "bold") + facet_grid(cols = vars(linker), rows = vars(FDR_char)) + scale_fill_manual(values = c(red, cb[2], red, cb[4], red, cb[3], red, cb[6], red, cb[5], red)) + theme_half_open() + background_grid() + panel_border() + theme(axis.title.x = element_blank(), axis.text.x = element_text(face = "bold")) + ylab("# unique cross-links")+ scale_y_continuous(limits = c(0, 290), sec.axis = sec_axis(trans=n~.*1, name="estimated FDR")) + theme(axis.line.y.right = element_blank(), axis.ticks.y.right = element_blank(), axis.text.y.right = element_blank()) + theme(legend.position = "none") + theme(axis.line.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) complete_plot = plot_grid(legend, actual_plot, ncol = 1, rel_heights = c(1, 18)) complete_plot save_plot("barplot/barplot_1And5pFDR_fixedColours.png", complete_plot, base_height = 7) # VENN DIAGRAM: all_peptides = bind_rows( "MS Annika" = annika_crosslinks %>% filter(groups_eq) %>% filter(FDR == 1) %>% select(linker, sequence_a_ieql, sequence_b_ieql), "XlinkX 2.4" = xlinkx_crosslinks_24and25%>% filter(groups_eq) %>% filter(FDR == 1) %>% filter(PD=="PD2.4") %>% select(linker, sequence_a_ieql, sequence_b_ieql), "MeroX" = merox_crosslinks %>% filter(groups_eq) %>% filter(FDR == 1) %>% select(linker, sequence_a_ieql, sequence_b_ieql), "pLink" = plink_crosslinks %>% filter(groups_eq) %>% filter(FDR == 1) %>% select(linker, sequence_a_ieql, sequence_b_ieql), .id = "tool" ) peps_for_venn = all_peptides %>% mutate(mass_a = gsub("J", "L",Peptides::mw(sequence_a_ieql, T))) %>% mutate(mass_b = gsub("J", "L",Peptides::mw(sequence_b_ieql, T))) %>% mutate(sequence_a = ifelse(mass_a% mutate(sequence_b = ifelse(mass_a% unite(sequences, sequence_a, sequence_b, remove = F) %>% select(tool, linker, sequence_a, sequence_b, sequences) tools = peps_for_venn %>% select(tool) %>% distinct() %>% unlist() linkers = peps_for_venn %>% select(linker) %>% distinct() %>% unlist() setlist_dsso = peps_for_venn %>% filter(linker == "DSSO") %>% select(tool, sequences) %>% split_tibble("tool") %>% lapply(unlist) %>% lapply(unname) setlist_dsbu = peps_for_venn %>% filter(linker == "DSBU") %>% select(tool, sequences) %>% split_tibble("tool") %>% lapply(unlist) %>% lapply(unname) venndir(setlist_dsbu, plot_style = "gg") ggsave("venn/venndir_dsbu_nonProp.png") venndir(setlist_dsso, plot_style = "gg") ggsave("venn/venndir_dsso_nonProp.png")