#!/usr/bin/python import optparse, os, time, math, numpy start_time = time.time() usage = "%prog [options]" version = "%prog 4.0" description = "Does NIPT analysis from tableOK to end. Now with improoved gender prediction" #Handle the commands line options parser = optparse.OptionParser(usage=usage, version=version, description=description) parser.add_option("-i", "--infile", dest="infile", help="table with k-mer counts. Colnames chromosome, rownames samples") parser.add_option("-o", "--outfile", dest="outfile", help="Final output file.") parser.add_option("-r", "--control", dest="control", help="(optional) list of patient names to be used as control group") parser.add_option("-n", "--norm", dest="norm", action="store_true", default=False, help="(optional) flag to swich normalization on") parser.add_option("-d", "--debug", dest="debug", action="store_true", default=False, help="(optional) flag used to write out intermediate failes to debug") parser.add_option("-e", "--extra", dest="extra", help="(optional) coma separated list of extra column names that should be included as parameters in addition to chromosomes and GC. Use as -e col1,col2,col3") parser.add_option("-b", "--boy", dest="boy", default="0.02", help="threshold over what the baby is a boy (default 0.02)") parser.add_option("-g", "--girl", dest="girl", default="0.01", help="threshold under what the baby is a girl (default 0.01)") (options, args) = parser.parse_args() def readIn(): with open(options.infile, 'r') as ifile: data = [line.strip().split('\t') for line in ifile] return data def numData(data): for i in range(len(data)): for j in range(len(data[i])): data[i][j] = float(data[i][j]) return data def splitData(data): colnames = data[0][1:] rownames = [data[x][0] for x in range(1,len(data))] realdata = [data[x][1:] for x in range(1,len(data))] return colnames, rownames, realdata def rmRow(data, i): return data[:i]+data[i+1:] def rmCol(data, i): return [data[x][:i]+data[x][i+1:] for x in range(len(data))] def getRow(rdata, i): return rdata[i] def getCol(data, i): return [data[x][i] for x in range(len(data))] def writeOut(outfile, data): with open(outfile, 'w') as ofile: for line in data: ofile.write('\t'.join([str(cell) for cell in line])+'\n') def mean_sd(values): if len(values) == 0: return 0, 0 mean = float(sum(values))/len(values) return mean, math.sqrt(sum([(val-mean)**2 for val in values])/len(values)) def transpose(data): if len(data) < 1: return [] outdata = [[] for x in range(len(data[0]))] for i in range(len(data)): for j in range(len(data[i])): outdata[j].append(data[i][j]) return outdata def makeNiceTable(rownames, colnames, numbers): print 'printing table', len(rownames), "rownames,", len(colnames), "columnnames, table size", len(numbers),"x",len(numbers[0]) out = [['sample'] + colnames] for i in range(len(rownames)): out.append([rownames[i]]+numbers[i]) return out def paste2Table(table, colname, values): table[0].append(colname) for i in range(1, len(table)): table[i].append(values[i-1]) return table def getChromIndexes(filelink, chromInTable): data = [] with open(filelink, 'r') as ifile: for line in ifile: data.append(chromInTable.index(line.strip().split('\t')[0])) return data def getRefIndexes(filelink, samplelist, defaultGender): allRef = [] boys = [] girls = [] with open(filelink, 'r') as ifile: for line in ifile: lineContent = line.strip().split('\t') targetSample = samplelist.index(lineContent[0]) allRef.append(targetSample) if len(lineContent) > 1: if lineContent[1] == 'female': girls.append(targetSample) if lineContent[1] == 'male': boys.append(targetSample) else: if defaultGender[targetSample] == 'boy': boys.append(targetSample) if defaultGender[targetSample] == 'girl': girls.append(targetSample) return allRef, boys, girls def linearRegression(X, y): # linear regression # b = (Xt * X)-1 * Xt * y ay = numpy.array([y]) aX = numpy.array(X) aXt = aX.transpose() b = numpy.dot(numpy.dot(numpy.linalg.inv(numpy.dot(aXt, aX)), aXt), ay.transpose()) listB = [float(b[x]) for x in range(len(b))] eps = [y[i] - sum([X[i][j]*listB[j] for j in range(len(X[i]))]) for i in range(len(y))] return listB, mean_sd(eps)[0] def covMartix(data): # data = 24 col, 300 row -> corrcoef = 24x24 arData = numpy.array(data) arDataT = arData.transpose() return numpy.ma.cov(arDataT) #### peab olema covariance def Mahalanobis(alldata, refdata): # = sqrt( (xT-mT)T * Si * (xT-mT) ) def avgRows(mx): sums = mx[0][:] for i in range(1, len(mx)): for j in range(len(mx[i])): sums[j] += float(mx[i][j]) return [sums[x]/len(mx) for x in range(len(sums))] m = numpy.array([avgRows(refdata)]) mT = m.transpose() S = covMartix(refdata) Si = numpy.linalg.inv(S) dists = [] for i in range(len(alldata)): x = numpy.array([getRow(alldata, i)]) xT = x.transpose() diff = numpy.subtract(xT, mT) dists.append(math.sqrt(numpy.dot(numpy.dot(diff.transpose(), Si), diff))) return dists def findPredCoverage(ref, alldata, chroms): allPredictions = [] for indexchrom in range(len(chroms)): chrCol = getCol(ref, indexchrom) nonChrColdata = rmCol(ref, indexchrom) beeta, epsilon = linearRegression(nonChrColdata, chrCol) alldata_nonChrm = rmCol(alldata, indexchrom) realCov = getCol(alldata, indexchrom) predCov = [] for i in range(len(alldata_nonChrm)): y = sum([beeta[a]*alldata_nonChrm[i][a] for a in range(len(beeta))]) + epsilon predCov.append(y) allPredictions.append(predCov) result = numpy.array(allPredictions) return result.transpose() def numpyArray2list(data): return [list(row) for row in data] def trimTable(table, colnames, rownames, badcolNames, badRownames): removableColIndex = [] removableRowIndex = [] for badcol in badcolNames: if badcol in colnames: removableColIndex.append(colnames.index(badcol)) for badrow in badRownames: if badrow in rownames: removableRowIndex.append(rownames.index(badrow)) resultdataRows = [] resultRownames = [] for i in range(len(table)): if i not in removableRowIndex: resultdataRows.append(getRow(table, i)) resultRownames.append(rownames[i]) resultdataCols = [] resultColNames = [] for j in range(len(resultdataRows[0])): if j not in removableColIndex and j < len(resultdataRows[0]): resultdataCols.append(getCol(resultdataRows, j)) resultColNames.append(colnames[j]) result = transpose(resultdataCols) return resultRownames, resultColNames, result def normalizeTableByChrom(data, colnames, normalizeBy): cromIndex= [] for col in normalizeBy: if col in colnames: cromIndex.append(colnames.index(col)) if len(cromIndex) > 0: normalized = [] for i in range(len(data)): factor = float(sum([data[i][x] for x in cromIndex])) / len(cromIndex) normalized.append([data[i][j]/factor for j in range(len(data[i]))]) else: normalized = data noUse, newColnames, table = trimTable(normalized, colnames, sample, normalizeBy, []) return newColnames, table def substractAndDivide(pred, actual): p = numpyArray2list(pred) a = numpyArray2list(actual) result = [] for i in range(len(p)): line = [] for j in range(len(p[i])): line.append((a[i][j]-p[i][j])/p[i][j]) result.append(line) return result ### some configurable variables goodCols = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y','GC'] if options.extra != None: goodCols.extend(options.extra.split(",")) rmRowNames = ["TOTAL","GC"] ### end of configurable variables disabledNormalization = [] ## no fixed chromosomes used to normalize boythreshold = float(options.boy) girlthreshold = float(options.girl) naiveGenderModleRefChromosomes = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22'] data = readIn() ## read everything as text inchromlist, insamplenames, indata = splitData(data) totals = [getRow(indata, insamplenames.index("TOTAL"))] if "GC" in inchromlist: totals[0][inchromlist.index("GC")] = "1.0" ## because this is used to divide and CG does not have a "TOTAL" per se ## make list to clean table from unwanted columns rmColNames = [] for c in range(len(inchromlist)): if inchromlist[c] in goodCols: continue else: rmColNames.append(inchromlist[c]) ## end of making list to clean table from unwanted columns sample, chroms, indata = trimTable(indata, inchromlist, insamplenames, rmColNames, rmRowNames) indata = numData(indata) # make the inner dataset floats notused1, notused2, totals = trimTable(totals, inchromlist, ['totals'], rmColNames, []) totals = numData(totals) # make totals also floats coverageTable = [[indata[i][j]/totals[0][j] for j in range(len(indata[i]))] for i in range(len(indata))] if options.debug: writeOut('1_coverageTable.tsv', makeNiceTable(sample, chroms, coverageTable)) genderModelCoverageTable = coverageTable genderModelChroms = chroms genderCols, genderData = normalizeTableByChrom(genderModelCoverageTable, genderModelChroms, naiveGenderModleRefChromosomes) Ycoverage = getCol(genderData, genderCols.index('Y')) ## getting Y coverage for later sample, chroms2mah, coverageTable2mah = trimTable(coverageTable, chroms, sample, ['X','Y','GC'], []) ## geting only autosomes sample, chroms, coverageTable = trimTable(coverageTable, chroms, sample, ['X','Y'], []) ## getting autosomes and GC content ## gender assessment genders = ['boy' if y > boythreshold else ('girl' if y < girlthreshold else '?') for y in Ycoverage] if options.control: referenceGroupIndexes, referenceBoyIndexes, referenceGirlIndexes = getRefIndexes(options.control, sample, genders) referenceGroupData2mah = [getRow(coverageTable2mah, i) for i in referenceGroupIndexes] referenceGroupData = [getRow(coverageTable, i) for i in referenceGroupIndexes] referenceGroupSamplenames = [sample[a] for a in referenceGroupIndexes] referenceBoyData = [getRow(genderModelCoverageTable, i) for i in referenceBoyIndexes] referenceGirlData = [getRow(genderModelCoverageTable, i) for i in referenceGirlIndexes] else: referenceGroupData2mah = coverageTable2mah referenceGroupData = coverageTable referenceGroupSamplenames = sample referenceBoyIndexes = [] referenceGirlIndexes = [] for i in range(len(genders)): if genders[i] == 'boy': referenceBoyIndexes.append(i) if genders[i] == 'girl': referenceGirlIndexes.append(i) referenceBoyData = [getRow(genderModelCoverageTable, i) for i in referenceBoyIndexes] referenceGirlData = [getRow(genderModelCoverageTable, i) for i in referenceGirlIndexes] print("[WARN] No reference group given! Calculating references based on the whole table.") ## calculating Mahalanobis distance mahcols, mahREFdata = normalizeTableByChrom(referenceGroupData2mah, chroms2mah, disabledNormalization) mahcols, mahALLdata = normalizeTableByChrom(coverageTable2mah, chroms2mah, disabledNormalization) mahalanobis_dist = Mahalanobis(mahALLdata, mahREFdata) ## calculating difference from average reference predictedCoverage = findPredCoverage(referenceGroupData, coverageTable, chroms) actualCoverage = numpy.array(coverageTable) if options.norm: coverageDifference = substractAndDivide(predictedCoverage, actualCoverage) if options.debug: writeOut('3_normalizedDifference.tsv', makeNiceTable(sample, chroms, coverageDifference)) else: coverageDifference = numpyArray2list(numpy.subtract(actualCoverage, predictedCoverage)) if options.debug: writeOut('3_difference.tsv', makeNiceTable(sample, chroms, coverageDifference)) if options.debug: writeOut('2_predictedCoverageTable.tsv', makeNiceTable(sample, chroms, numpyArray2list(predictedCoverage))) ## calculating reference inner variance refPredicted = findPredCoverage(referenceGroupData, referenceGroupData, chroms) refActual = numpy.array(referenceGroupData) if options.norm: refDiff = substractAndDivide(refPredicted.transpose(), refActual.transpose()) if options.debug: writeOut('5_referenceNormalizedDifference.tsv', makeNiceTable(referenceGroupSamplenames, chroms, transpose(refDiff))) else: refDiff = numpyArray2list(numpy.subtract(refActual, refPredicted).transpose()) if options.debug: writeOut('5_referenceDifference.tsv', makeNiceTable(referenceGroupSamplenames, chroms, transpose(refDiff)) ) meanAndSDperChrom = [mean_sd(chromosomeValues) for chromosomeValues in refDiff] if options.debug: writeOut('4_predictedReferenceTable.tsv', makeNiceTable(referenceGroupSamplenames, chroms, numpyArray2list(refPredicted))) writeOut('6_meanAndSD.tsv', makeNiceTable(chroms, ['average','SD'], numpyArray2list(meanAndSDperChrom))) ## Standardizing (z-scoring) zScores = [] for pat in range(len(coverageDifference)): patLine = [] for chrom in range(len(coverageDifference[pat])): zSc = (coverageDifference[pat][chrom]-meanAndSDperChrom[chrom][0]) / meanAndSDperChrom[chrom][1] patLine.append(zSc) if options.debug and abs(zSc) > 4: print('[INFO] chr %2s in pat %22s has z-score %5s' % (chroms[chrom], sample[pat], str(round(zSc,2)))) zScores.append(patLine) #### Making gender models # Standardizing data according to some chromosome boystcols, boyreferenceGroupData = normalizeTableByChrom(referenceBoyData, genderModelChroms, disabledNormalization) girlstcols, girlreferenceGroupData = normalizeTableByChrom(referenceGirlData, genderModelChroms, disabledNormalization) genModChroms, genModCoverageTable = normalizeTableByChrom(genderModelCoverageTable, genderModelChroms, disabledNormalization) # calculating difference from average reference genModActualCoverage = numpy.array(genModCoverageTable) boyModPredictedCoverage = findPredCoverage(boyreferenceGroupData, genModCoverageTable, boystcols) girlModPredictedCoverage = findPredCoverage(girlreferenceGroupData, genModCoverageTable, girlstcols) boyCoverageDifference = numpyArray2list(numpy.subtract(genModActualCoverage, boyModPredictedCoverage)) girlCoverageDifference = numpyArray2list(numpy.subtract(genModActualCoverage, girlModPredictedCoverage)) # calculating reference inner variance boyPredicted = findPredCoverage(boyreferenceGroupData, boyreferenceGroupData, boystcols) boyActual = numpy.array(boyreferenceGroupData) boyDiff = numpyArray2list(numpy.subtract(boyActual, boyPredicted).transpose()) boyMeanAndSDperChrom = [mean_sd(chromosomeValues) for chromosomeValues in boyDiff] girlPredicted = findPredCoverage(girlreferenceGroupData, girlreferenceGroupData, girlstcols) girlActual = numpy.array(girlreferenceGroupData) girlDiff = numpyArray2list(numpy.subtract(girlActual, girlPredicted).transpose()) girlMeanAndSDperChrom = [mean_sd(chromosomeValues) for chromosomeValues in girlDiff] # Standardizing (z-scoring) GenModZ = [] XYindexes = [genModChroms.index('X'), genModChroms.index('Y')] modelGenderPrediction = [] ## coming out [boyModelX, girlModelX, boyModelY, girlModelY] for pat in range(len(coverageDifference)): patLine = [] for chrom in XYindexes: patLine.append((boyCoverageDifference[pat][chrom]-boyMeanAndSDperChrom[chrom][0]) / boyMeanAndSDperChrom[chrom][1]) patLine.append((girlCoverageDifference[pat][chrom]-girlMeanAndSDperChrom[chrom][0]) / girlMeanAndSDperChrom[chrom][1]) GenModZ.append(patLine) modelGenderPrediction.append('boy' if patLine[3] >= 10.0 else ('girl' if patLine[3] <= 5.0 else '?')) GenModZ = transpose(GenModZ) ### pasting info together outdata = makeNiceTable(sample, chroms, zScores) outdata = paste2Table(outdata, 'boyModelX', GenModZ[0]) outdata = paste2Table(outdata, 'girlModelX', GenModZ[1]) outdata = paste2Table(outdata, 'boyModelY', GenModZ[2]) outdata = paste2Table(outdata, 'girlModelY', GenModZ[3]) outdata = paste2Table(outdata, "mahalanobis", mahalanobis_dist) outdata = paste2Table(outdata, 'modelGender', modelGenderPrediction) writeOut(options.outfile, outdata) print('[INFO] Results in file '+options.outfile) print __file__.split("/")[-1]+" finished. Time", int(1000*(time.time()-start_time)), "msec."