Old code refurbishment！

6 minute read

Introduction

Here are the general requirements for writing code:

Code Readability:
- Use meaningful variable and function names so that others can understand what the code does.
- Utilize comments appropriately to explain complex algorithms or logic.
- Follow style guidelines such as indentation and spacing to keep the code neat and consistent.
Modular Design:
- Break code down into small, reusable functions or modules, with each module responsible for a specific task.
- Avoid duplicate code by using functions or classes to reuse logic.
Error Handling:
- Implement error handling mechanisms to ensure that the code can gracefully handle exceptions rather than crashing.
- Use exception handling to catch and manage potential errors.
Performance Optimization:
- For applications with high performance requirements, pay attention to the time complexity and space complexity of algorithms.
- Collect and analyze performance data, and optimize as necessary.
Testing:
- Write unit tests to ensure the correctness of each functional module.
- Use testing frameworks to automate the testing process and ensure that functionality is not broken after code changes.
Documentation:
- Write project documentation to explain how to use, set up, and understand the functionalities of the code.
- Provide API documentation to help developers understand how to utilize your codebase.
Version Control:
- Use version control tools (such as Git) to manage the history of code changes.
- Make clear commits with meaningful messages.
Adhering to Best Practices:
- Always keep an eye on the best practices for the language and framework you are using, updating code to conform to the latest standards.
- Engage in code reviews to learn from colleagues and improve code quality.

Example One: Venn Diagram

Detailed content Click

modified code

#!/usr/bin/env Rscript
# -*- coding: utf-8 -*-
# @Author    : mengqingyao
# @Time      : 20241029
# @File      : veen.r

library(readr)
library(tidyverse)
library(ggVennDiagram)
library(R6)

# 创建一个R6类
VennDiagramAnalysis <- R6Class("VennDiagramAnalysis",
  public = list(
    orthogroups_file = NULL,
    unassigned_genes_file = NULL,
    output_prefix = "venn_diagram",  # 输出前缀
    plot_width = 20,                  # 图片宽度
    plot_height = 18,                 # 图片高度
    Orthogroup_all = NULL,
    orth_gen = NULL,
    result = NULL,

    initialize = function(orthogroups_file, unassigned_genes_file, output_prefix = "venn_diagram", plot_width = 20, plot_height = 18) {
      self$orthogroups_file <- orthogroups_file
      self$unassigned_genes_file <- unassigned_genes_file
      self$output_prefix <- output_prefix
      self$plot_width <- plot_width
      self$plot_height <- plot_height
      self$load_data()
      self$process_data()
    },

    load_data = function() {
      Orthogroups <- read_tsv(self$orthogroups_file)
      Orthogroups_UnassignedGenes <- read_tsv(self$unassigned_genes_file)
      self$Orthogroup_all <- bind_rows(Orthogroups, Orthogroups_UnassignedGenes) %>%
        select(-"Hartmannula_sinica_185.fna_remove")
    },

    first_obse = function(data, n = 1, sep = ",") {
      list <- as.list(data)
      for (v in 2:length(colnames(data))) {
        for (o in 1:length(rownames(data))) {
          list[[v]][o] <- list[[v]][[o]] %>% strsplit(sep)
          list[[v]][o] <- list[[v]][[o]][n]
        }
      }
      for (v in 2:length(colnames(data))) {
        list[[v]] <- as.character(list[[v]])
      }
      as.data.frame(list)
    },

    process_data = function() {
      Orthogroup_all <- self$first_obse(self$Orthogroup_all)
      col_names <- colnames(Orthogroup_all)[-1]

      unique_symbols <- unique(gsub("[A-Za-z0-9]", "", col_names))
      split_content_list <- lapply(col_names, function(name) {
        strsplit(name, paste0("[", paste(unique_symbols, collapse = ""), "]"))[[1]]
      })

      col_renames <- sapply(split_content_list, function(split_content) {
        paste(head(split_content, 2), collapse = "_")
      })

      self$orth_gen <- lapply(seq_along(col_renames), function(i) {
        Orthogroup_all %>%
          filter(!!rlang::sym(col_names[i]) != "NA") %>%
          select(Orthogroup, !!rlang::sym(col_names[i])) %>%
          rename(!!col_renames[i] := "Orthogroup")
      })

      result <- Orthogroup_all %>% select(-Orthogroup)

      for (i in 1:length(col_names)) {
        result <- result %>%
          left_join(self$orth_gen[[i]], by = setNames(col_names[i], colnames(self$orth_gen[[i]])[2]))
      }

      self$result <- result %>% select((length(col_names) + 1):length(result))
    },

    plot_venn_diagram = function() {
      p1 <- ggVennDiagram(self$result, label = "count", edge_size = 1, label_alpha = 0) +
        scale_color_brewer(palette = "Set1") +
        scale_x_continuous(expand = expansion(mult = .2)) +
        scale_fill_distiller(palette = "Blues", direction = 1)

      p2 <- ggVennDiagram(self$result, label = "count", edge_size = 1, label_alpha = 0, 
                          set_color = c("blue", "red", "green", "purple", "orange", "yellow", "cyan")) +
        scale_color_brewer(palette = "Set1") +
        scale_x_continuous(expand = expansion(mult = .2)) +
        scale_fill_distiller(palette = "Blues", direction = 1)

      # 使用输出前缀和尺寸进行保存
      ggsave(filename = paste0(self$output_prefix, "_p1.pdf"), plot = p1, width = self$plot_width, height = self$plot_height, units = "cm")
      ggsave(filename = paste0(self$output_prefix, "_p2.pdf"), plot = p2, width = self$plot_width, height = self$plot_height, units = "cm")
    }
  )
)

Save the above code to a file, e.g. veen.r

Now, let’s test the code:

source(veen.r)

venn_analysis <- VennDiagramAnalysis$new(
  orthogroups_file = "./OrthoFinder/OrthoFinder/Results_May31/Orthogroups/Orthogroups.tsv",
  unassigned_genes_file = "./OrthoFinder/OrthoFinder/Results_May31/Orthogroups/Orthogroups_UnassignedGenes.tsv",
  output_prefix = "my_venn_diagram",  # 自定义输出前缀
  plot_width = 25,                     # 自定义宽度
  plot_height = 20                      # 自定义高度
)

venn_analysis$plot_venn_diagram()

The output will be two pdf files: my_venn_diagram_p1.pdf and my_venn_diagram_p2.pdf.

Example Two: Upset

Detailed content Click

#!/usr/bin/env Rscript
# -*- coding: utf-8 -*-
# @Author    : mengqingyao
# @Time      : 20241030

rm(list = ls())
library(readr)
library(ComplexUpset)
library(tidyverse)

Orthogroups <- read_tsv("./OrthoFinder/OrthoFinder/Results_May31/Orthogroups/Orthogroups.tsv") 
Orthogroups_UnassignedGenes <- read_tsv("./OrthoFinder/OrthoFinder/Results_May31/Orthogroups/Orthogroups_UnassignedGenes.tsv")

# 最多支持7个变量
Orthogroups <- Orthogroups %>% select(-`Hartmannula_sinica_185.fna_remove`)
Orthogroups_UnassignedGenes <- Orthogroups_UnassignedGenes %>% select(-`Hartmannula_sinica_185.fna_remove`)

first_obse = function(data, n = 1, sep = ",") {
  list <- as.list(data)
  for (v in 2:length(colnames(data))) {
    for (o in 1:length(rownames(data))) {
      list[[v]][o] <- list[[v]][[o]] %>% strsplit(sep)
      list[[v]][o] <- list[[v]][[o]][n]
    }
  }
  for (v in 2:length(colnames(data))) {
    list[[v]] <- as.character(list[[v]])
  }
  as.data.frame(list)
}

Orthogroups <- first_obse(Orthogroups)
Orthogroups_all <- rbind(Orthogroups,Orthogroups_UnassignedGenes)

#  eggnog注释信息

all_7_species_emapper_mod <- read_delim("eggNOG_test/7_species.emapper_mod.annotations", 
                                        delim = "\t", escape_double = FALSE, 
                                        trim_ws = TRUE)

#  KASS注释信息
# 设置文件夹路径
kegg_path <- "./KEGG_test"  # 请替换为您的KEGG文件夹的实际路径

# 列出KEGG文件夹中的所有文件
kegg_files <- list.files(kegg_path)

# 识别唯一的符号
unique_symbols <- unique(gsub("[A-Za-z0-9]", "", kegg_files))

# 按符号全部分割
split_content_list <- lapply(kegg_files, function(name) {
  strsplit(name, paste0("[", paste(unique_symbols, collapse = ""), "]"))[[1]]
})

# 保留分割后的前两部分，“_”连接、
kegg_files_mod_1 <- sapply(split_content_list, function(split_content) {
  paste(head(split_content, 2),collapse = "_")
})

kegg_files_mod_2 <- sapply(split_content_list, function(split_content) {
  # 取前两个元素
  first_two <- head(split_content, 2)
  # 用paste将它们组合，并添加"_KO"（确保不重复）
  result <- paste(first_two, collapse = "_")
  # 确保只添加一个"_KO"
  if (!grepl("_KO$", result)) {  # 检查是否以"_KO"结尾
    result <- paste(result, "_KO", sep = "")
  }
  return(result)
})

kegg_files_mod_3 <- sapply(split_content_list, function(split_content) {
  first_two <- head(split_content, 2)
  result <- paste(first_two, collapse = "_")
  if (!grepl("_COG$", result)) {  # 检查是否以"_KO"结尾
    result <- paste(result, "_COG", sep = "")
  }
  return(result)
})

kegg_files_mod_4 <- sapply(split_content_list, function(split_content) {
  first_two <- head(split_content, 2)
  result <- paste(first_two, collapse = "_")
  if (!grepl("_merge$", result)) {  # 检查是否以"_KO"结尾
    result <- paste(result, "_merge", sep = "")
  }
  return(result)
})

kegg_files_mod_5 <- sapply(split_content_list, function(split_content) {
  first_two <- head(split_content, 2)
  result <- paste(first_two, collapse = "_")
  if (!grepl("_pre$", result)) {  # 检查是否以"_KO"结尾
    result <- paste(result, "_pre", sep = "")
  }
  return(result)
})

# 读取文件
kegg_list <- list()  # 确保 kegg_list 已初始化

for (i in 1:length(kegg_files)) {
  # 读取文件
  kegg_list[[i]] <- read_delim(paste("KEGG_test/", kegg_files[i], sep = ""), 
                               delim = "\t", 
                               escape_double = FALSE, 
                               col_names = FALSE, 
                               trim_ws = TRUE)
  
  # 修改列名
  names(kegg_list[[i]])[1] <- "query"
  names(kegg_list[[i]])[2] <- kegg_files_mod_1[i]  # 使用正确的 kegg_files_mod
  
  # 添加新的列，符合 dplyr 的用法
  kegg_list[[i]] <- kegg_list[[i]] %>%
    mutate(!!sym(kegg_files_mod_2[i]) := 1)  # 使用 !!sym() 确保列名正确
  
}

ann_gene_asso <- data.frame()

ann_gene_asso <- Reduce(function(x, y) merge(x, y, by = "query", all = TRUE), 
                        kegg_list)

ann_gene_asso <- merge(all_7_species_emapper_mod, ann_gene_asso, by = "query", all = TRUE)

ann_gene_asso <- ann_gene_asso %>% select(query,`eggNOG_OGs`,all_of(kegg_files_mod_1), all_of(kegg_files_mod_2))

#  分离单个物种
results_separate <- lapply(1:length(kegg_files_mod_2), function(i) {
  ann_gene_asso %>% 
    filter(get(kegg_files_mod_2[i]) == 1) %>% 
    select(query, eggNOG_OGs, !!sym(kegg_files_mod_1[i]))  %>% # 使用 !!sym() 动态获取列名
    mutate(!!kegg_files_mod_5[i] := 1)
})
Orthogroups_all_names <- sort(names(Orthogroups_all)[-1])

dataframe_list <- lapply(seq_along(Orthogroups_all_names), function(i) {
  # 检查当前的列名
  left_col_name <- Orthogroups_all_names[i]
  
  if (!left_col_name %in% colnames(Orthogroups_all)) {
    stop(paste("Column", left_col_name, "not found in Orthogroups_all"))
  }
  
  Orthogroups_all %>% 
    left_join(results_separate[[i]], by = setNames("query", Orthogroups_all_names[i])) %>% 
    select(-all_of(Orthogroups_all_names)) %>% 
    rename(!!kegg_files_mod_3[i] := eggNOG_OGs)
})

#  使用bind_rows合并列表中的所有数据框
combined_df <- bind_rows(dataframe_list)
combined_df <- combined_df %>% arrange(Orthogroup)

# 使用 dplyr 进行分组和串联
results_df <- combined_df %>%
  group_by(Orthogroup) %>%
  summarize(across(everything(), ~ paste(na.omit(.), collapse = ", ")), .groups = 'drop')

results_df[results_df == ""] <- NA

results_df <- results_df %>%
  mutate(across(-1, ~ ifelse(is.na(.), 0, 1)))

results_class <- results_df %>%
  mutate(
    classification = case_when(
      rowSums(across(ends_with("_COG"), ~ as.numeric(as.character(.)))) >= 1 &
        rowSums(across(all_of(kegg_files_mod_1), ~ as.numeric(as.character(.)))) >= 1 ~ "COG and KEGG",  # 首先检查同时存在 COG 和 KEGG 的情况
      rowSums(across(ends_with("_COG"), ~ as.numeric(as.character(.)))) >= 1 ~ "COG",  
      rowSums(across(-c(ends_with("_COG"), Orthogroup), ~ as.numeric(as.character(.)))) >= 1 ~ "KEGG",  
      rowSums(across(-Orthogroup, ~ as.numeric(as.character(.)))) == 0 ~ "Unknown",
      TRUE ~ "Other"
    )
  )

results_class_1 <- results_class  # 复制原始数据框

# 逐个处理每对列并生成新列
for (i in seq_along(kegg_files_mod_1)) {
  results_class_1 <- results_class_1 %>%
    mutate(!!kegg_files_mod_4[i] := ifelse(
      rowSums(across(c(kegg_files_mod_3[i], kegg_files_mod_1[i])),
              na.rm = TRUE) >= 1,
      1,
      0
    ))
}

results_class_1 <- results_class_1 %>%
  select(-all_of(c(kegg_files_mod_3, kegg_files_mod_1, kegg_files_mod_4)))

#  增加颜色映射
CC <- c(`COG` = '#3C5488B2', `COG and KEGG` = '#4DBBD5B2', `KEGG` = '#FF3333', `Unknown` = '#DDDDDD')

#  画图
p1 <- upset(results_class_1,
            kegg_files_mod_5,
            width = .1,
            wrap = T,
            min_size = 29,
            #  按照交集类型和交集大小排列
            sort_intersections_by=c('degree', 'cardinality'),
            base_annotations = list(
              "intersection size" = intersection_size(
                counts = T,
                mapping = aes(fill=classification) 
              )
              +theme(axis.line.y = element_line(colour = "black",size=0.5),axis.ticks.y=element_line(colour = "black",size=0.5))+
                scale_y_continuous(expand = c(0,0))+
                scale_fill_manual(values = CC)
            ),
            set_sizes=(
              upset_set_size()+ geom_text(aes(label=..count..), hjust=1, stat='count') 
              + expand_limits(y=2300)
              + theme(axis.text.x=element_text(angle=90))
            ))

ggsave(filename = "test.pdf", p1, width = 35, height = 20, units = "cm")

result

In the case of unknown genes that were not assigned to species, it is because for the homozygous gene analysis I used 8 species, and in this analysis I removed one.

Email me with more questions! 584338215@qq.com

Share on

Twitter Facebook LinkedIn

Mengqingyao

Old code refurbishment！

Introduction

Example One: Venn Diagram

Now, let’s test the code:

Example Two: Upset

result

In the case of unknown genes that were not assigned to species, it is because for the homozygous gene analysis I used 8 species, and in this analysis I removed one.

Share on

You May Also Enjoy

Gene prediction in eukaryotes is exemplified by Euplotes!

GenBank database download tool！

Extract Reads With Telomeric Reads!

KEGG annotation information collection and update！