# Copyright (C) 2017  Wilson Wen Bin GOH
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

#library(testthat)
#library(devtools)
#library(roxygen2)
#to build a vignette directory
#use_vignette("NetProt")
#press Cmd + Shift + K (in RStudio) to knit the vignette

library("e1071")
## Warning: package 'e1071' was built under R version 3.2.5
library("genefilter")
## Warning: package 'genefilter' was built under R version 3.2.3
## 
## Attaching package: 'genefilter'
## The following object is masked from 'package:base':
## 
##     anyNA
library("pheatmap")
## Warning: package 'pheatmap' was built under R version 3.2.3
library("limma")
## Warning: package 'limma' was built under R version 3.2.4
library("pvclust")
library("vioplot")
## Loading required package: sm
## Package 'sm', version 2.2-5.4: type help(sm) for summary information

@title This is the RCC renal cancer control data to be included in my package

@name RCC @docType data @author Wilson Goh @references @keywords data

NULL
## NULL

@title This is the RC renal cancer data to be included in my package

@name RC @docType data @author Wilson Goh @references @keywords data

NULL
## NULL

@title This is the CR colorectal cancer data to be included in my package

@name CR @docType data @author Wilson Goh @references @keywords data

NULL
## NULL
#generic functions --- used by several methods

@title A wrapper around t.test() that returns NA instead of raising an error @name my.t.test.p.value

my.t.test.p.value <- function(...)
{
  obj<-try(t.test(...), silent=TRUE)
  if (is(obj, "try-error")) return(NA) else return(obj$p.value)
}
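
#Illustrative check (not part of the original code): the wrapper returns an
#ordinary p-value for well-behaved input and NA wherever t.test() would error,
#e.g. on essentially constant vectors.
my.t.test.p.value(rnorm(10), rnorm(10))
my.t.test.p.value(rep(1, 5), rep(1, 5)) #t.test() errors here, so NA is returned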

@title Gene Fuzzy Scoring (GFS) @name gfs @param vec a vector of values corresponding to one sample @return new_vec A vector of GFS transformed values @export

gfs <- function(vec)
{
  #alpha_1 <- x * length(vec)
  #alpha_2 <- y * length(vec)
  alpha_1 <- 0.1 * length(vec)
  alpha_2 <- 0.2 * length(vec)

  ranks <- rank(-vec)   #rank 1 = highest value
  ord <- order(ranks)   #positions ordered from highest- to lowest-ranked
  bin_size <- round((alpha_2 - alpha_1)/5)

  new_vec <- vec

  new_vec[ord[1:round(alpha_1)]] <- 1                                              #top alpha_1 fraction
  new_vec[ord[(round(alpha_2)+1):length(vec)]] <- 0                                #below alpha_2
  new_vec[ord[round(alpha_1) + (1:bin_size)]] <- 0.8                               #bin 1
  new_vec[ord[round(alpha_1 + (alpha_2 - alpha_1)/5) + (1:bin_size)]] <- 0.6       #bin 2
  new_vec[ord[round(alpha_1 + 2 * (alpha_2 - alpha_1)/5) + (1:bin_size)]] <- 0.4   #bin 3
  new_vec[ord[round(alpha_1 + 3 * (alpha_2 - alpha_1)/5) + (1:bin_size)]] <- 0.2   #bin 4
  new_vec[ord[round(alpha_1 + 4 * (alpha_2 - alpha_1)/5) + (1:bin_size)]] <- 0.1   #bin 5

  return(new_vec)
}
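
#Illustrative sketch (the toy vector below is made up): GFS-transform one
#simulated sample and tabulate how many features land in each fuzzy score.
set.seed(1)
toy_sample <- rnorm(100)
table(gfs(toy_sample))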

@title Gene No Fuzzy Scoring (GNFS) @name gnfs @param vec a vector of values corresponding to one sample @return new_vec A vector of GNFS transformed values @export

gnfs <- function(vec)
{
  #alpha_1 <- x * length(vec)
  #alpha_2 <- y * length(vec)
  alpha_1 <- 0.1 * length(vec)
  #alpha_2 <- 0.2 * length(vec)

  ranks <- rank(-vec)   #rank 1 = highest value
  ord <- order(ranks)   #positions ordered from highest- to lowest-ranked

  new_vec <- vec

  new_vec[ord[1:round(alpha_1)]] <- 1                 #top alpha_1 fraction
  new_vec[ord[(round(alpha_1)+1):length(vec)]] <- 0   #everything else

  return(new_vec)
}

@title Matrix Binarization @name binarize_mat @param vec a data vector or matrix @return vec A binarized copy of the input (1 where a value is present, 0 where it was NA) @export

binarize_mat <- function(vec)
{
  vec[!is.na(vec)] <- 1
  vec[is.na(vec)] <- 0
  return(vec)
}

@title Runs row-wise two-sample t-tests on a pair of data matrices @name standard_t_test @param x a list of two data matrices @param y significance level @return id_output A list of two vectors of feature IDs significant at level y, one per matrix @export

standard_t_test <- function(x,y)
{

  ttest_pvals_1 <- c()
  ttest_pvals_2 <- c()

  #process the first matrix
  classes_1 <- as.factor(matrix(unlist(strsplit(colnames(x[[1]]), "_")), ncol=ncol(x[[1]]))[1,])      #factors of the rank weight matrix
  cancer_class_1 <- x[[1]][, grep(levels(classes_1)[1], colnames(x[[1]]))]
  normal_class_1 <- x[[1]][, grep(levels(classes_1)[2], colnames(x[[1]]))]

  for (i in 1:nrow(x[[1]]))
  {
    test_1 <- my.t.test.p.value(normal_class_1[i,], cancer_class_1[i,], alternative="greater",var.equal = FALSE,paired = FALSE)
    ttest_pvals_1 <- append(ttest_pvals_1, test_1)
  }
  ttest_pvals_1[is.na(ttest_pvals_1)] <- 1
  names(ttest_pvals_1) <- rownames(x[[1]])

  #out1 <- x[[1]][names(ttest_pvals_1)[ttest_pvals_1 <= y],]

  #############
  classes_2 <- as.factor(matrix(unlist(strsplit(colnames(x[[2]]), "_")), ncol=ncol(x[[2]]))[1,])      #factors of the rank weight matrix
  cancer_class_2 <- x[[2]][, grep(levels(classes_2)[1], colnames(x[[2]]))]
  normal_class_2 <- x[[2]][, grep(levels(classes_2)[2], colnames(x[[2]]))]

  for (i in 1:nrow(x[[2]]))
  {
    test_2 <- my.t.test.p.value(cancer_class_2[i,], normal_class_2[i,], alternative="greater",var.equal = FALSE,paired = FALSE)
    ttest_pvals_2 <- append(ttest_pvals_2, test_2)
  }
  ttest_pvals_2[is.na(ttest_pvals_2)] <- 1
  names(ttest_pvals_2) <- rownames(x[[2]])

  #out2 <- x[[2]][names(ttest_pvals_2)[ttest_pvals_2 <= y],]

  id_output <- as.list(c())
  #id_output <- union(names(ttest_pvals_1)[ttest_pvals_1 <= y], names(ttest_pvals_2)[ttest_pvals_2 <= y])
  id_output[[1]] <- names(ttest_pvals_1)[ttest_pvals_1 <= y]
  id_output[[2]] <- names(ttest_pvals_2)[ttest_pvals_2 <= y]
  return(id_output)
}
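
#Illustrative sketch (toy objects are made up): standard_t_test() expects a list
#of two matrices whose column names encode the class before an underscore
#(e.g. "C_1" ... "N_5") and returns the feature IDs significant in each matrix.
set.seed(2)
toy_two_class <- matrix(rnorm(200), nrow = 20,
                        dimnames = list(paste0("prot", 1:20),
                                        c(paste0("C_", 1:5), paste0("N_", 1:5))))
toy_pair <- list(toy_two_class, toy_two_class) #e.g. two transformed versions of the same data
standard_t_test(toy_pair, 0.05)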

@title Row-wise t-tests on a (QPSP score) matrix, keeping significant rows @name qpsp_standard_t_test @param x data matrix @param y significance level @return out The rows of the row-scaled matrix whose t-test p-values are at or below y @export

qpsp_standard_t_test <- function(x, y) #x is the data matrix, and y is the significance level
{
  classes <- as.factor(matrix(unlist(strsplit(colnames(x), "_")), ncol=ncol(x))[1,])      #factors of the rank weight matrix
  normal_vals_mean <- rowMeans(x[, grep(levels(classes)[2], colnames(x))]) #not used downstream
  cancer_vals_mean <- rowMeans(x[, grep(levels(classes)[1], colnames(x))]) #not used downstream

  x <- t(scale(t(x)))

  ttest <- rowttests(as.matrix(x), classes,  tstatOnly = FALSE)
  ttest_pvals <- ttest$p.value
  ttest_pvals[is.na(ttest_pvals)] <- 1

  names(ttest_pvals) <- rownames(x)
  out <- x[names(ttest_pvals)[ttest_pvals <= y],]
  return(out)
}

@title Generates a weight matrix by applying generate_weight_vector to every column @name generate_weight_matrix @param x data matrix of values in [0, 1] (e.g. a normalised rank matrix) @return weight_matrix A weight matrix @export

generate_weight_matrix <- function(x)
{
  weight_matrix <- apply(x, 2, generate_weight_vector)
  rownames(weight_matrix) <- rownames(x)
  return(weight_matrix)
}

@title Generates a weight vector @name generate_weight_vector @param x a vector of values in [0, 1] @return output A weight vector @export

generate_weight_vector <- function(x)
{
  output <- c()
  max <- 1
  first_percentile <- (0.9 * max)
  second_percentile_1 <- (0.875 * max)
  second_percentile_2 <- (0.85 * max)
  second_percentile_3 <- (0.825 * max)
  second_percentile_4 <- (0.80 * max)

  #we are going from 1 to 0.8, 0.6,0.4,0.2, and 0s for all else
  for (i in 1:length(x))
  {
    if (x[i] >= first_percentile)
    {
      output <- append(output, "1")
    }
    else if (x[i] >= second_percentile_1)
    {
      output <- append(output, "0.8")
    }
    else if (x[i] >= second_percentile_2)
    {
      output <- append(output, "0.6")
    }
    else if (x[i]  >= second_percentile_3)
    {
      output <- append(output, "0.4")
    }
    else if (x[i]  >= second_percentile_4)
    {
      output <- append(output, "0.2")
    }
    else
    {
      output <- append(output, "0")
    }
  }

  output <- as.numeric(output)
  return(output)
}

@title Generates a rank matrix @name generate_rank_matrix @param x data matrix @return output A rank matrix @export

generate_rank_matrix <- function(x)
{
  rank_matrix <- apply(x, 2, rank)
  #rank_matrix <- (nrow(x) - rank_matrix)/nrow(x)
  rank_matrix <- (rank_matrix)/nrow(x)

  rownames(rank_matrix) <- rownames(x)
  colnames(rank_matrix) <- colnames(x)

  return(rank_matrix)
}
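
#Illustrative sketch (toy_rank_demo is made up): per-sample ranks are first
#normalised to (0, 1], then mapped to the 1/0.8/0.6/0.4/0.2/0 weight scheme.
set.seed(3)
toy_rank_demo <- matrix(rnorm(200), nrow = 20,
                        dimnames = list(paste0("prot", 1:20), paste0("S_", 1:10)))
toy_ranks <- generate_rank_matrix(toy_rank_demo)
toy_rank_weights <- generate_weight_matrix(toy_ranks)
head(toy_rank_weights)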

@title Generates a modulated weight matrix @name subset_weights @param x weight matrix with class labels encoded in the column names @return output A list of two class-modulated weight matrices @export

subset_weights <- function(x) #for each gene, the average weight within each class is computed and then multiplied against the original matrix
{
  output <- c()
  output <- as.list(output)
  classes <- as.factor(matrix(unlist(strsplit(colnames(x), "_")), ncol=ncol(x))[1,])        #factors of the rank weight matrix
  normal_vals_mean <- rowMeans(x[, grep(levels(classes)[2], colnames(x))]) #this provides the vote for the particular gene for class A
  cancer_vals_mean <- rowMeans(x[, grep(levels(classes)[1], colnames(x))]) #this provides the vote for the particular gene for class B

  #colnames(modulator_matrix_A) <- NULL
  #colnames(modulator_matrix_B) <- NULL
  new_mat_A <- (normal_vals_mean * x)
  new_mat_B <- (cancer_vals_mean * x)

  colnames(new_mat_A) <- colnames(x)
  colnames(new_mat_B) <- colnames(x)
  rownames(new_mat_A) <- rownames(x)
  rownames(new_mat_B) <- rownames(x)

  output[[1]] <- new_mat_A
  output[[2]] <- new_mat_B
  #output <- append(output, new_mat_A)
  #output <- append(output, new_mat_B)
  return (output)
}


#ESSNET methods

@title This generates all paired differences, deltas, for each gene between samples in class A and B @name internal_substraction @param group_A samples in group A @param group_B samples in group B @return all_diffs A matrix of deltas between class A and B @export

internal_substraction <- function(group_A, group_B) #returns an array of subtraction vectors for all genes/proteins
{
  all_diffs <- as.data.frame(c())

  size <- length(group_A) #number of samples; group_A and group_B are expected to be data frames with the same number of columns
  sel <- cbind(rep(1:size, each=size), rep(1:size, times=size)) #all between-class sample pairings

  for (x in 1:nrow(group_A))
  {
    A <- group_A[x,]
    B <- group_B[x,]
    delta=as.numeric(A[sel[,1]]-B[sel[,2]])
    all_diffs <- rbind(all_diffs, delta)
    #append the diff to the all diffs here
  }
  row.names(all_diffs) <- row.names(group_A)
  return(all_diffs)
}

@title Based on the output of internal_substraction, this calculates a p-value for each complex @name essnet @param diff_matrix A matrix of deltas derived from internal_substraction @param complex_matrix A named list of complexes (character vectors of member proteins) @param overlap_threshold The required minimal overlap between the proteomics data and the network @return p_val_vec A vector of p-values, one for each complex @export

essnet <- function(diff_matrix, complex_matrix, overlap_threshold)
{
  p_val_vec <- c()
  complex_names <- names(complex_matrix) #contains the names of the complexes

  for (i in 1:length(complex_matrix))
  {
    overlap_size <- sum(as.numeric(rownames(diff_matrix) %in% complex_matrix[[i]])) #sums the number of trues
    complex_components <- complex_matrix[[i]]
    if (overlap_size <= overlap_threshold)
    {
      p_val_vec <- append(p_val_vec, 0)
    }
    else
    {
      list_vector <- c()
      subset <- diff_matrix[rownames(diff_matrix) %in% complex_components,]
      for (j in 1:nrow(subset))
      {
        list_vector <- append(list_vector, as.numeric(subset[j,]))
      }
      #p_val_vec <- append(p_val_vec, t.test(list_vector)$p.value)

      #size <- length(list_vector)
      #size <- sqrt(length(list_vector))
      #size <- overlap_size * sqrt(length(list_vector))
      size <- sqrt(overlap_size) * sqrt(length(list_vector)) #here we reduce the effect of k by taking square root of k
      sterr<-(sd(list_vector)/sqrt(size-1))
      stat<-mean(list_vector)/sterr
      t.pval <- sum(abs(rt(1000,size-1))>abs(stat))/1000
      p_val_vec <- append(p_val_vec, t.pval)
    }
  }
  names(p_val_vec) <- complex_names
  return(p_val_vec)
}
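
#Illustrative sketch (all toy_* objects are made up): internal_substraction()
#builds the matrix of paired class differences, and essnet() then scores each
#complex with sufficient overlap.
set.seed(4)
toy_A <- as.data.frame(matrix(rnorm(60), nrow = 12,
                              dimnames = list(paste0("prot", 1:12), paste0("N_", 1:5))))
toy_B <- as.data.frame(matrix(rnorm(60, mean = 1), nrow = 12,
                              dimnames = list(paste0("prot", 1:12), paste0("C_", 1:5))))
toy_deltas <- internal_substraction(toy_A, toy_B)
toy_essnet_cplx <- list(cplx1 = paste0("prot", 1:6), cplx2 = paste0("prot", 7:12))
essnet(toy_deltas, toy_essnet_cplx, overlap_threshold = 2)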


#SNET methods

@title Generates a vector of snet weights for a given sample @name snet_weight_vector @param x An expression vector @return output A vector of weights @export

snet_weight_vector <- function(x)
{
  output <- c()
  max <- 1
  first_percentile <- (0.9 * max)

  #the top 10% get a score of 1 and everything else gets 0
  for (i in 1:length(x))
  {
    if (x[i] >= first_percentile)
    {
      output <- append(output, "1")
    }
    else
    {
      output <- append(output, "0")
    }
  }
  output <- as.numeric(output)
  return(output)
}

@title Generates a matrix of snet weights for a given data matrix @name snet_generate_weight_matrix @param x An expression matrix @return output A matrix of weights @export

snet_generate_weight_matrix <- function(x)
{
  weight_matrix <- apply(x, 2, snet_weight_vector)
  rownames(weight_matrix) <- rownames(x)
  return(weight_matrix)
}

#FSNET methods

@title Based on the rank weight matrices and the list of complexes, gives a matrix of scores per complex @name fsnet @param rank_weight_matrix A list of two class-modulated rank weight matrices @param complex_matrix A named list of complexes @param overlap_threshold The required minimal overlap between the proteomics data and the network @return output A list of two matrices of FSNET scores (complexes by samples), one per class-modulated weight matrix @export

fsnet <- function(rank_weight_matrix, complex_matrix, overlap_threshold) #returns matrices of FSNET scores (complexes by samples) that can then be evaluated by a p-value calculation step
{
  fsnet_matrix_A <- c() #this will hold vectors of FSNET scores that are assembled into a matrix
  fsnet_matrix_B <- c() #this will hold vectors of FSNET scores that are assembled into a matrix

  for (i in 1:length(complex_matrix))
  {
    overlap_size <- sum(as.numeric(rownames(rank_weight_matrix[[1]]) %in% complex_matrix[[i]])) #sums the number of trues
    complex_components <- complex_matrix[[i]]
    if (overlap_size <= overlap_threshold)
    {
      fsnet_matrix_A <- rbind(fsnet_matrix_A,rep(0, ncol(rank_weight_matrix[[1]])))
      fsnet_matrix_B <- rbind(fsnet_matrix_B,rep(0, ncol(rank_weight_matrix[[2]])))
    }
    else
    {
      subset_A <- rank_weight_matrix[[1]][rownames(rank_weight_matrix[[1]]) %in% complex_components,]
      subset_B <- rank_weight_matrix[[2]][rownames(rank_weight_matrix[[2]]) %in% complex_components,]
      #here if we want to calculate the class modulator we can do so by splitting the classes by names and calculating the average normalized rank
      #subset <- subset_weights(subset)
      complex_sums_A <- colSums(subset_A) #otherwise, we simply add the rank_weight values up.
      complex_sums_B <- colSums(subset_B) #otherwise, we simply add the rank_weight values up.
      fsnet_matrix_A <- rbind(fsnet_matrix_A, complex_sums_A)
      fsnet_matrix_B <- rbind(fsnet_matrix_B, complex_sums_B)
    }
  }
  rownames(fsnet_matrix_A) <- names(complex_matrix)
  rownames(fsnet_matrix_B) <- names(complex_matrix)
  colnames(fsnet_matrix_A) <- colnames(rank_weight_matrix[[1]])
  colnames(fsnet_matrix_B) <- colnames(rank_weight_matrix[[2]])

  output <- as.list(c())
  output[[1]] <- fsnet_matrix_A
  output[[2]] <- fsnet_matrix_B
  return(output)
}
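
#Illustrative sketch (toy_* objects are made up; the exact FSNET preprocessing
#chain is assumed here): class-encoded column names are rank-weighted, modulated
#per class with subset_weights(), and then scored per complex.
set.seed(5)
toy_classed_expr <- matrix(rnorm(200), nrow = 20,
                           dimnames = list(paste0("prot", 1:20),
                                           c(paste0("C_", 1:5), paste0("N_", 1:5))))
toy_cplx_list <- list(cplx1 = paste0("prot", 1:10), cplx2 = paste0("prot", 11:20))
toy_modulated <- subset_weights(generate_weight_matrix(generate_rank_matrix(toy_classed_expr)))
toy_fsnet <- fsnet(toy_modulated, toy_cplx_list, overlap_threshold = 3)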

#PFSNET methods

@title Uses the original FSNET matrix but applies PFSNET’s calculation method @name pfsnet_theoretical_t_test @param x FSNET matrix @param y test significance level @return out The complexes whose PFSNET p-values are at or below y @export

pfsnet_theoretical_t_test <- function(x, y) #where x is the fsnet matrix and y is the significance level of test
{
  #x <- t(scale(t(x)))
  #lets play with the first matrix
  #classes <- as.factor(matrix(unlist(strsplit(colnames(x[[1]]), "_")), ncol=ncol(x[[1]]))[1,])      #factors of the rank weight matrix
  #normal_class <- x[[1]][, grep(levels(classes)[1], colnames(x[[1]]))]
  #cancer_class <- x[[1]][, grep(levels(classes)[2], colnames(x[[1]]))]

  class_A <- x[[1]] #matrix 1
  class_B <- x[[2]] #matrix 2
  pvals <- c()

  for (i in 1:nrow(class_A)) #for each complex
  {
    all_pairs <- expand.grid(class_A[i,],class_B[i,]) #generate all pairs
    differences <- all_pairs[,1] - all_pairs[,2]

    size <- sqrt(length(differences))
    sterr <- sd(differences)/(size - 1)
    stat <- mean(differences)/sterr
    t.pval <- sum(abs(rt(1000,size-1))>abs(stat))/1000
    pvals <- append(pvals, t.pval)
  }
  pvals[is.na(pvals)] <- 1
  names(pvals) <- rownames(x[[1]])
  out <- pvals[pvals <= y]
  return(out)
}
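
#Continuing the made-up toy_fsnet object from the FSNET sketch above:
pfsnet_theoretical_t_test(toy_fsnet, 0.05)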

#PPFSNET methods

@title Uses the original FSNET matrix but applies PPFSNET’s calculation method @name ppfsnet_theoretical_t_test @param x FSNET matrix @param y test significance level @return out The complexes whose PPFSNET p-values are at or below y @export

ppfsnet_theoretical_t_test <- function(x, y) #where x is the fsnet matrix and y is the significance level of test
{
  #first we generate two matrices of all paired differences

  #derive the class labels from the column names of the first matrix
  classes <- as.factor(matrix(unlist(strsplit(colnames(x[[1]]), "_")), ncol=ncol(x[[1]]))[1,])      #factors of the rank weight matrix

  pvals <- c()
  for (i in 1:nrow(x[[1]]))
  {
    #cancer_class_A <- x[[1]][i, grep(levels(classes)[1], colnames(x[[1]]))]
    normal_class_A <- x[[1]][i, grep(levels(classes)[2], colnames(x[[1]]))]

    cancer_class_B <- x[[2]][i, grep(levels(classes)[1], colnames(x[[2]]))]
    #normal_class_B <- x[[2]][i, grep(levels(classes)[2], colnames(x[[2]]))]

    #all_pairs_A <- expand.grid(cancer_class_A, cancer_class_B)
    #all_pairs_B <- expand.grid(normal_class_A, normal_class_B)

    all_pairs_B_vs_A <- expand.grid(cancer_class_B, normal_class_A)
    all_pairs_B_vs_A <- abs(all_pairs_B_vs_A[,1] - all_pairs_B_vs_A[,2])
    #differences_A <-all_pairs_A[,1] - all_pairs_A[,2]
    #differences_B <-all_pairs_B[,1] - all_pairs_B[,2]

    #all_pairs <- expand.grid(differences_A, differences_B)

    #differences <- all_pairs[,1] - all_pairs[,2]
    differences <- all_pairs_B_vs_A

    size <- sqrt(length(differences)/2)
    sterr <- sd(differences)/(size - 1)
    stat <- mean(differences)/sterr
    t.pval <- sum(abs(rt(1000,size-1))>=abs(stat))/1000
    pvals <- append(pvals, t.pval)
  }
  pvals[is.na(pvals)] <- 1
  names(pvals) <- rownames(x[[1]])
  out <- pvals[pvals <= y]
  return(out)
}
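
#Continuing the same made-up toy_fsnet object (its column names carry the class labels):
ppfsnet_theoretical_t_test(toy_fsnet, 0.05)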



#HE

@title The hypergeometric enrichment pipeline @name he @param x An expression data matrix @param y A list of complexes @param z Class factors @return hyp_pval A vector of p-values, one for each complex @export

he <- function(x, y, z) # where x is the data matrix, y is the list of complexes, and z is the class factor
{
  hyp_pval <- c()

  #the row-wise t-tests depend only on x and z, so they are computed once, before the loop
  test_ttest <- rowttests(as.matrix(x), z)
  pvals <- test_ttest$p.value
  pvals <- p.adjust(pvals, method="bonferroni")
  sig_prot_ttest <- rownames(x)[which(pvals <= 0.05)]

  for (n in 1:length(y))
  {
    q <- length(intersect(y[[n]], sig_prot_ttest)) #size of the intersection
    k <- length(y[[n]]) #size of the complex
    N <- nrow(x) #all the proteins
    m <- length(sig_prot_ttest) #all the significant proteins

    pval <- 1 - phyper(q-1, m, N-m, k)
    hyp_pval <- append(hyp_pval, pval)
  }
  return(hyp_pval)
}
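
#Illustrative sketch (reusing the made-up toy_classed_expr and toy_cplx_list from
#the FSNET sketch above, plus a made-up class factor):
toy_class_fact <- factor(rep(c("C", "N"), each = 5))
he(toy_classed_expr, toy_cplx_list, toy_class_fact)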


#GSEA

@title The original GSEA algorithm based on the Kolmogorov–Smirnov test @name gsea @param data expression matrix @param complex_vector A list of complexes @param factors levels indicating the class of samples in the expression matrix @return gsea_pval A vector of FDR-adjusted p-values, one for each complex @export

gsea <- function(data, complex_vector, factors)
{
  gsea_pval <- c()
  pvals <- rowttests(as.matrix(data), factors)$p.value
  names(pvals) <- rownames(data)
  ranks <- rank(pvals)
  #the row-wise t-test p-values above are converted to ranks before the KS comparison

  for (x in 1:length(complex_vector))
  {
    if (length(ranks[which(names(ranks) %in% complex_vector[[x]])]) >= 1)
    {
      ks_pval <- ks.test(ranks[which(names(ranks) %in% complex_vector[[x]])], ranks[which(!names(ranks) %in% complex_vector[[x]])])$p.value
      gsea_pval <- append(gsea_pval, ks_pval)
    }
    else
    {
      gsea_pval <- append(gsea_pval, 1)
    }
  }
  gsea_pval <- p.adjust(gsea_pval, method="fdr") #comment this line out if multiple-testing correction is not wanted
  return(gsea_pval)
}
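
#Continuing the same made-up objects:
gsea(toy_classed_expr, toy_cplx_list, toy_class_fact)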

#QPSP methods

@title QPSP rank matrix generation function @name qpsp_generate_rank_matrix @param x expression matrix @return rank_matrix A matrix of weighted ranks @export

qpsp_generate_rank_matrix <- function(x)
{
  rank_matrix <- apply(x, 2, rank)
  #rank_matrix <- rank_matrix/nrow(x)
  rank_matrix <- (rank_matrix)/nrow(x)

  rownames(rank_matrix) <- rownames(x)
  colnames(rank_matrix) <- colnames(x)

  return(rank_matrix)
}

@title The QPSP complex scoring and testing pipeline @name qpsp @param rank_weight_matrix A matrix of weights based on ranks @param complex_list A list of complexes @return output The QPSP score rows (complexes) significant at the 0.05 level @export

qpsp <- function(rank_weight_matrix, complex_list)
{
  qpsp_matrix <- c()

  for (j in 1:length(complex_list))
  {
    #print(j)
    if (length(rownames(rank_weight_matrix)[which(rownames(rank_weight_matrix) %in% complex_list[[j]])]) > 1)
    {
      qpsp_matrix <- rbind(qpsp_matrix, colSums(rank_weight_matrix[rownames(rank_weight_matrix)[which(rownames(rank_weight_matrix) %in% complex_list[[j]])],]))
    }
    else if (length(rownames(rank_weight_matrix)[which(rownames(rank_weight_matrix) %in% complex_list[[j]])]) == 1)
    {
      qpsp_matrix <- rbind(qpsp_matrix, rank_weight_matrix[rownames(rank_weight_matrix)[which(rownames(rank_weight_matrix) %in% complex_list[[j]])],])
    }
    else
    {
      qpsp_matrix <- rbind(qpsp_matrix, c(rep(0, ncol(rank_weight_matrix))))
    }

  }

  rownames(qpsp_matrix) <- names(complex_list)

  output <- qpsp_standard_t_test(qpsp_matrix, 0.05)
  return(output)
}
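
#Illustrative sketch (reusing the made-up toy_classed_expr and toy_cplx_list from
#above): rank-derived weights are summed per complex and the complexes passing
#the 0.05 t-test filter are returned.
toy_qpsp_weights <- generate_weight_matrix(qpsp_generate_rank_matrix(toy_classed_expr))
qpsp(toy_qpsp_weights, toy_cplx_list)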

#Simulate datasets

@title Proteomics data simulation function @name generate_proteomics_sim @param data A one-class data matrix @param fact Factors assigning class labels to the samples @param prop Proportion of variables to receive an inserted effect @param rounds Number of simulations @param effect_size A vector of intended effect sizes @return Writes one simulated dataset per round to sim.<i>.txt in the working directory @export

generate_proteomics_sim <- function(data, fact, prop, rounds, effect_size) #data is one-class data, fact assigns class labels to the samples, prop is the proportion of variables to receive an effect, rounds is the number of simulated datasets to generate, effect_size is a vector of effect sizes to sample from
{
  for (i in 1:rounds)
  {
    print(i)
    norm <- which(fact == levels(fact)[1])
    cancer <- which(fact == levels(fact)[2])

    null_mat <- matrix(rep(1,nrow(data)*ncol(data)), nrow= nrow(data), ncol=ncol(data)) #the multiplication factor
    null_null_mat <- matrix(rep(0,nrow(data)*ncol(data)), nrow= nrow(data), ncol=ncol(data)) #determines the adjustment factor

    diff <- sample(1:nrow(data), round(prop * nrow(data))) #choose the 'prop' fraction of rows
    null_null_mat[diff, cancer] <- replicate(length(diff), sample(effect_size, 1))

    null_mat <- null_mat + null_null_mat

    data_adjust <- data * null_mat

    factor_vec <- rep("ns", nrow(data))
    factor_vec[diff] <- 'sig'

    data_adjust <- cbind(rep(rounds, nrow(data)), data_adjust, factor_vec)
    new_colnames <- c("GeneLength", as.vector(fact), "sig")
    colnames(data_adjust) <- new_colnames

    write.table(data_adjust, file=paste("sim.",i, ".txt",sep=""), sep="\t", row.names=T, col.names=T, quote=F)
  }
}
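
#Illustrative call (all inputs are made up; note that this writes sim.<i>.txt
#files into the working directory). 2500 proteins are simulated so that the 20%
#'sig' subset matches the 50 x 10 pseudo-complexes built by generate_pseudo_cplx().
set.seed(8)
toy_base <- matrix(abs(rnorm(2500 * 8, mean = 10)), nrow = 2500,
                   dimnames = list(paste0("prot", 1:2500), paste0("S_", 1:8)))
toy_fact <- factor(rep(c("N", "C"), each = 4))
generate_proteomics_sim(toy_base, toy_fact, prop = 0.2, rounds = 1,
                        effect_size = c(0.5, 1, 2))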

#Simulate complex

@title Pseudo-complex simulation function @name generate_pseudo_cplx @param tdata A simulated data table as written by generate_proteomics_sim @param fact Factors (currently unused) @param purity Proportion of differential proteins in each 'sig' complex @return complex_vector A list of simulated pseudo-complexes @export

generate_pseudo_cplx <- function(tdata, fact, purity) #pseudo complex generator
{
  complex_vector <- c()
  data <- tdata
  data <- data[,-1]
  data <- data[,-9]
  #colnames(data) <- c(paste('N_', 1:4), paste('C_', 1:4))

  sig_prot <- rownames(tdata[which(tdata[,10]=='sig'),])
  ns_prot <- sample(rownames(tdata[which(tdata[,10]=='ns'),]), length(rownames(tdata[which(tdata[,10]=='sig'),])))

  cor_sig_clust <- hclust(dist(tdata[sig_prot,2:9]), method="ward.D")
  cor_ns_clust <- hclust(dist(tdata[ns_prot,2:9]), method="ward.D")

  sig_prot <- sig_prot[cor_sig_clust$order]
  ns_prot <- ns_prot[cor_ns_clust$order]

  sig_prot_complex <- split(sig_prot, sort(c(rep(rep(1:50), 10))))
  names(sig_prot_complex) <- paste("sig_",names(sig_prot_complex), sep='')

  ns_prot_complex <- split(ns_prot, sort(c(rep(rep(1:50), 10))))
  names(ns_prot_complex) <- paste("ns_",names(ns_prot_complex), sep='')

  for (j in 1:length(sig_prot_complex)) ######purity adjustment ---- comment this block out if not wanted; this substitutes (1 - purity) of each 'sig' complex with 'ns' proteins
  {
    sig_prot_complex[[j]][1:round(((1 - purity)*length(sig_prot_complex[[j]])))] <- ns_prot_complex[[j]][1:round(((1 - purity)*length(sig_prot_complex[[j]])))]
  }

  complex_vector <- append(sig_prot_complex, ns_prot_complex)

  return(complex_vector)
}
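
#Illustrative follow-up (continues the made-up simulation above): read one
#simulated table back and build 50 'sig' and 50 'ns' pseudo-complexes of 10
#proteins each at 80% purity.
toy_sim <- read.table("sim.1.txt", header = TRUE, sep = "\t")
toy_pseudo_cplx <- generate_pseudo_cplx(toy_sim, toy_fact, purity = 0.8)
length(toy_pseudo_cplx)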