calcDuplicateFreqInTransc = function(r,L,N,insert=1){
l = r*N/1000/insert
z = r*N*L/1000
c(total=z,dupl=z - (1-exp(-l))*L*insert)
}
#' @param rs vector of gene RPKMs
#' @param ls vector of gene lengths (longest transcript was used)
#' @param N lib size (in millions)
#' @param insert insert variability
calcDuplicateFreqInLib = function(rs,ls,N,insert=1){
rs = rs / sum(rs*ls/1e9) #RPKM was estimated using different gene models (only constant exons were considered) so, RPKM need to be adjusted to fit library size
z=apply(sapply(1:length(rs),function(i)calcDuplicateFreqInTransc(rs[i],ls[i],N,insert)),1,sum)
z[2]=z[2]/z[1]
z
}
#mean.exp = real human RPKM of protein coding genes
#gene.len = lengths of longest transcripts of the same genes as in mean.exp
calcDuplicateFreqInLib(mean.exp,gene.len,20,1)