#!/usr/bin/env python # -*- coding: utf-8 -*- # # spectrum analyzer.py # # By Karli Reiding # Started: 19-08-2013 # Current: 07-01-2014 import os import math import time # filenames COMPOSITIONS = "compositions.txt" # Name of compositions file containing list of single letter annotations with number (e.g. H5N4E2F1) OUTPUT = "output.txt" # Name of output file # analysis properties MASS_MODIFIERS = ["sodium", "neg_electron", "free"] # Mass changes regardless of compositions CALCULATION_WINDOW = 0.49 # Range to sum around the calculated m/z ISOTOPE_FRACTION = 0.95 # Percentage of isotope distribution to combine NOISE_DISTANCE = [-1.00335] # Relative m/z of noise to isotope cluster # analyses to include PICK_NOISE = 1 # Allow assessing of noise CORRECT_NOISE = 1 # Allow subtraction of noise from isotopes SUM_ISOTOPES = 1 # Sum isotopes to one value RELATIVE = 1 # Normalize values relative to sum GROUP = 1 # Group values in output # other parameters VERBOSE = 0 # Displays processing info to user LIMIT_DATA = 1 # Limits amount of memory used FULL_SPREAD = 1 # Allows precise calculation of data distribution # component properties _13C = {'p':0.01109, 'dmass':1.00335, 'max':10} BLOCKS = {'F':{'mass':146.05790879894, 'carbons':6}, 'H':{'mass':162.0528234185, 'carbons':6}, 'N':{'mass':203.07937251951, 'carbons':8}, 'S':{'mass':291.09541650647, 'carbons':11}, 'l':{'mass':273.08485182277, 'carbons':11}, 'L':{'mass':273.08485182277, 'carbons':11}, 'm':{'mass':305.11106657061, 'carbons':12}, 'M':{'mass':305.11106657061, 'carbons':12}, 'e':{'mass':319.12671663475, 'carbons':13}, 'E':{'mass':319.12671663475, 'carbons':13}, 'aa':{'mass':139.06332853255, 'carbons':7}, 'free':{'mass':18.0105646837, 'carbons':0}, 'proton':{'mass':1.007276466812, 'carbons':0}, 'neg_proton':{'mass':-1.007276466812, 'carbons':0}, 'sodium':{'mass':22.9897692809, 'carbons':0}, 'potassium':{'mass':38.96370668, 'carbons':0}, 'electron':{'mass':0.00054857990946, 'carbons':0}, 'neg_electron':{'mass':-0.00054857990946, 'carbons':0}, 'P':{'mass':79.96633, 'avg_mass':79.96633, 'carbons':0}} INCLUDE = BLOCKS.keys() ########## Data import ########## def _file_import(filename): "Imports lines from file in list." if VERBOSE: print "\tImporting " + filename + "...", lines = [] with open(filename) as f: for line in f: lines.append(line.rstrip("\n")) if VERBOSE: print "done!" return lines def _files_index(): "Indexes local directory for .dat files." if VERBOSE: print "\tIndexing directory...", files = [] for i, val in enumerate(os.listdir(".")): if ".dat" in val: files.append(val) if not files: print "error - no .dat files found in directory." raw_input() quit() files.sort() if VERBOSE: print "done!" return files def _data_grab(files): "Grabs data from files and sorts into dictionary." data = {} for fn in files: if VERBOSE: print "\tReading from " + fn + "...", fname = fn.split(".")[0] data[fname] = {} data[fname]["mass"] = [] data[fname]["intensity"] = [] with open(fn) as f: for line in f: line = line.split() if len(line) > 2: print "error - wrong data format." raw_input() quit() data[fname]["mass"].append(float(line[0])) data[fname]["intensity"].append(int(line[1])) if VERBOSE: print "done!" return data ########## Composition calculation ########## def _compositions_split(composition_list): "Spits composition names into residue packets." if VERBOSE: print "\tSplitting compositions in monosaccharide units...", split_list = [] for composition in composition_list: split_dict = {} split_dict['composition'] = composition[:] for residue in INCLUDE: if composition.find(residue) != -1: comp_size = len(composition) count = composition.find(residue) + 1 num = "" while count < comp_size and composition[count].isdigit(): num += composition[count] count += 1 split_dict[residue] = int(num) else: split_dict[residue] = 0 split_list.append(split_dict) if VERBOSE: print "done!" return split_list def _compositions_mass(compositions): "Calculate exact composition masses." if VERBOSE: print "\tCalculating composition masses...", for i, val in enumerate(compositions): compositions[i]['mass'] = 0 for residue in INCLUDE: compositions[i]['mass'] += compositions[i][residue] * BLOCKS[residue]['mass'] for j in MASS_MODIFIERS: compositions[i]['mass'] += BLOCKS[j]['mass'] if VERBOSE: print "done!" return compositions def _compositions_carbons(compositions): if VERBOSE: print "\tCounting carbons...", for i, val in enumerate(compositions): compositions[i]['carbons'] = 0 for residue in INCLUDE: compositions[i]['carbons'] += compositions[i][residue] * BLOCKS[residue]['carbons'] for j in MASS_MODIFIERS: compositions[i]['carbons'] += BLOCKS[j]['carbons'] if VERBOSE: print "done!" return compositions def _binom(n,k,p): "Binomial function: chance f of exact k successes in n trials with probability p." nCk = math.factorial(n) / (math.factorial(k) * math.factorial(n - k)) f = nCk * p**k * (1 - p)**(n-k) return f def _compositions_isotopes(compositions): if VERBOSE: print "\tCalculating isotope profiles...", for i, val in enumerate(compositions): n = compositions[i]['carbons'] mz = compositions[i]['mass'] p = _13C['p'] m = _13C['dmass'] compositions[i]['p_iso'] = [] compositions[i]['m_iso'] = [] for k in range(_13C['max']): compositions[i]['m_iso'].append(mz + k * m) compositions[i]['p_iso'].append(_binom(n,k,p)) if VERBOSE: print "done!" return compositions def _compositions_major_isotopes(compositions): "Define major isotopes by checking cumulative isotope distribution against" "the indicated sum." if VERBOSE: print "\tDefining major isotopes...", for i, val in enumerate(compositions): count = 0 i_sum = 0 while i_sum < ISOTOPE_FRACTION: i_sum += compositions[i]['p_iso'][count] count += 1 compositions[i]['n_iso'] = count if VERBOSE: print "done!" return compositions ########## Data processing ########## def _analysis_construct(files, data, compositions, analysis): "Construct dictionary for storing analysis data." if VERBOSE: print "\tOrganizing data...", new = 0 if not analysis: analysis = {} new = 1 # Index files analysis['files'] = [] for i in files: analysis['files'].append(i.split(".")[0]) for i in analysis['files']: analysis[i] = {} # Index compositions, isotopes and masses if new: analysis['compositions'] = [] analysis['isotopes'] = [] analysis['masses'] = [] for i in compositions: analysis['compositions'].append(i['composition']) analysis['isotopes'].append(i['n_iso']) analysis['masses'].append(i['m_iso'][:]) # Index raw data layout for i in analysis['files']: analysis[i]['range_lower'] = data[i]['mass'][0] analysis[i]['range_upper'] = data[i]['mass'][-1] if FULL_SPREAD: lower = 100.0 upper = 0.0 for j, mass in enumerate(data[i]['mass'][:-1]): if data[i]['mass'][j+1] - mass < lower: lower = data[i]['mass'][j+1] - mass if data[i]['mass'][j+1] - mass > upper: upper = data[i]['mass'][j+1] - mass if lower == 0: lower = 0.0001 if upper == 0: upper = 0.0001 analysis[i]['spread_lower'] = lower analysis[i]['spread_upper'] = upper else: analysis[i]['spread_lower'] = data[i]['mass'][11] - data[i]['mass'][10] analysis[i]['spread_upper'] = data[i]['mass'][-10] - data[i]['mass'][-11] # Construct empty data format for i in analysis['files']: for j, val in enumerate(analysis['compositions']): analysis[i][val] = analysis['isotopes'][j]*[0] if VERBOSE: print "done!" return analysis def _analysis_peakpick_sum(analysis, data): "Sum raw data within defined window." for i, filename in enumerate(analysis['files']): data_size = len(data[filename]['mass']) if VERBOSE: print "\tPicking peaks from " + filename + "...", for j, composition in enumerate(analysis['compositions']): for k in range(analysis['isotopes'][j]): mass_lower = analysis['masses'][j][k] - CALCULATION_WINDOW mass_upper = analysis['masses'][j][k] + CALCULATION_WINDOW index_lower = mass_lower - analysis[filename]['range_lower'] index_lower = index_lower / analysis[filename]['spread_upper'] index_upper = mass_upper - analysis[filename]['range_lower'] index_upper = index_upper / analysis[filename]['spread_lower'] index_lower = int(index_lower) index_upper = int(index_upper) if index_upper > data_size: index_upper = data_size for m in range(index_lower, index_upper): if mass_lower < data[filename]['mass'][m] < mass_upper: analysis[filename][composition][k] += data[filename]['intensity'][m] # Noise picking if PICK_NOISE: analysis[filename]['noise'] = {} for j, composition in enumerate(analysis['compositions']): analysis[filename]['noise'][composition] = [0] * len(NOISE_DISTANCE) for k, noise_distance in enumerate(NOISE_DISTANCE): mass_lower = analysis['masses'][j][0] - CALCULATION_WINDOW + noise_distance mass_upper = analysis['masses'][j][0] + CALCULATION_WINDOW + noise_distance index_lower = mass_lower - analysis[filename]['range_lower'] index_lower = index_lower / analysis[filename]['spread_upper'] index_upper = mass_upper - analysis[filename]['range_lower'] index_upper = index_upper / analysis[filename]['spread_lower'] index_lower = int(index_lower) index_upper = int(index_upper) if index_upper > data_size: index_upper = data_size for m in range(index_lower, index_upper): if mass_lower < data[filename]['mass'][m] < mass_upper: analysis[filename]['noise'][composition][k] += data[filename]['intensity'][m] if VERBOSE: print "done!" return analysis def _analysis_correct_noise(analysis): "Subtracts average noise from individual isotopes." if VERBOSE: print "\tAdjusting intensities for noise...", for i, filename in enumerate(analysis['files']): analysis[filename]['noiseadjust'] = {} for j, composition in enumerate(analysis['compositions']): avg_noise = sum(analysis[filename]['noise'][composition]) / len(NOISE_DISTANCE) analysis[filename]['noiseadjust'][composition] = [] for k, isotope in enumerate(analysis[filename][composition]): analysis[filename]['noiseadjust'][composition].append(isotope) analysis[filename]['noiseadjust'][composition][k] -= avg_noise if analysis[filename]['noiseadjust'][composition][k] < 0: analysis[filename]['noiseadjust'][composition][k] = 0 if VERBOSE: print "done!" return analysis def _analysis_isotopes_sum(analysis): "Sums isotopes per composition." if VERBOSE: print "\tSumming isotopes...", for i, filename in enumerate(analysis['files']): analysis[filename]['i_sums'] = [] for j, composition in enumerate(analysis['compositions']): analysis[filename]['i_sums'].append(sum(analysis[filename][composition])) if CORRECT_NOISE: for i, filename in enumerate(analysis['files']): analysis[filename]['noiseadjust_i_sums'] = [] for j, composition in enumerate(analysis['compositions']): analysis[filename]['noiseadjust_i_sums'].append(sum(analysis[filename]['noiseadjust'][composition])) if VERBOSE: print "done!" return analysis def _analysis_relative(analysis): "Computes fractions of cumulative spectra." if VERBOSE: print "\tComputing relative intensities...", for i, filename in enumerate(analysis['files']): analysis[filename]['rel_i_sums'] = [] total_sum = float(sum(analysis[filename]['i_sums'])) if total_sum == 0: total_sum = 1 for j in analysis[filename]['i_sums']: analysis[filename]['rel_i_sums'].append(j / total_sum) if CORRECT_NOISE: for i, filename in enumerate(analysis['files']): analysis[filename]['noiseadjust_rel_i_sums'] = [] total_sum = float(sum(analysis[filename]['noiseadjust_i_sums'])) if total_sum == 0: total_sum = 1 for j in analysis[filename]['noiseadjust_i_sums']: analysis[filename]['noiseadjust_rel_i_sums'].append(j / total_sum) if VERBOSE: print "done!" return analysis ########## Writing ########## def _export(analysis): "Outputs analysis data into file." "composition - mz - i0 - i1 - i2 - i3 - sum - relative sum" if VERBOSE: print "\tWriting to " + OUTPUT + "...", max_isotope = max(analysis['isotopes']) # Constructing data header header = ['composition','m/z'] for i in range(max_isotope): header.append("i"+str(i)) if SUM_ISOTOPES: header.append("sum") if RELATIVE: header.append("rel") if PICK_NOISE: for i in NOISE_DISTANCE: header.append(str(i) + " noise") header.append("avg.noise") if CORRECT_NOISE: for i in range(max_isotope):header.append("corr. i"+str(i)) if SUM_ISOTOPES: header.append("corr. sum") if RELATIVE: header.append("corr. rel") header_line = "" for i in header: header_line += i + "\t" header_line = (header_line + "\t") * len(analysis['files']) header_line = header_line[:-1] + "\n" # Constructing file header file_line = "" for i, filename in enumerate(analysis['files']): file_line += filename file_line += "\t" * len(header) + "\t" file_line = file_line[:-1] + "\n" with open(OUTPUT,"w") as f: f.write(file_line) f.write(header_line) for j, composition in enumerate(analysis['compositions']): line = "" for i, filename in enumerate(analysis['files']): # composition line += composition + "\t" # m/z line += str(analysis['masses'][j][0]) + "\t" # isotopes iso_counter = 0 for k, value in enumerate(analysis[filename][composition]): line += str(value) + "\t" iso_counter += 1 while iso_counter < max_isotope: line += "\t" iso_counter += 1 # sum if SUM_ISOTOPES: line += str(analysis[filename]["i_sums"][j]) + "\t" # relative sum if SUM_ISOTOPES and RELATIVE: line += str(analysis[filename]["rel_i_sums"][j]) + "\t" if PICK_NOISE: for k, value in enumerate(analysis[filename]['noise'][composition]): line += str(value) + "\t" line += str(sum(analysis[filename]['noise'][composition]) / len(analysis[filename]['noise'][composition])) line += "\t" # noise corrected if CORRECT_NOISE: iso_counter = 0 for k, value in enumerate(analysis[filename]['noiseadjust'][composition]): line += str(value) + "\t" iso_counter += 1 while iso_counter < max_isotope: line += "\t" iso_counter += 1 if SUM_ISOTOPES: line += str(analysis[filename]["noiseadjust_i_sums"][j]) + "\t" if RELATIVE: line += str(analysis[filename]["noiseadjust_rel_i_sums"][j]) + "\t" # extra line += "\t" # finishing line = line[:-1] + "\n" f.write(line) # Display values together if GROUP: display = [] if CORRECT_NOISE: display.append("noiseadjust_i_sums") if RELATIVE: display.append("noiseadjust_rel_i_sums") else: display.append("i_sums") if RELATIVE: display.append("rel_i_sums") f.write("\n") # Header for mode in display: line = "" line += mode + "\n" line += "composition" + "\t" line += "m/z" + "\t" for i in analysis["files"]: line += i + "\t" line = line[:-1] + "\n" f.write(line) # Data for j, composition in enumerate(analysis["compositions"]): line = "" line += composition + "\t" line += str(analysis["masses"][j][0]) + "\t" for i, filename in enumerate(analysis["files"]): line += str(analysis[filename][mode][j]) + "\t" line = line[:-1] + "\n" f.write(line) f.write("\n") if VERBOSE: print "done!" return ########## Main ########## def main(): time_start = time.clock() if VERBOSE: print "Indexing compositions:" compositions = _compositions_split(_file_import(COMPOSITIONS)) compositions = _compositions_mass(compositions) compositions = _compositions_carbons(compositions) compositions = _compositions_isotopes(compositions) compositions = _compositions_major_isotopes(compositions) if VERBOSE: print "Finding files:" files = _files_index() if LIMIT_DATA: analysis = {} for i, filename in enumerate(files): if VERBOSE: print "Loading data:" else: print "Processing " + filename data = _data_grab([filename]) if VERBOSE: print "Processing data:" analysis = _analysis_construct([filename], data, compositions, analysis) analysis = _analysis_peakpick_sum(analysis, data) if CORRECT_NOISE: analysis = _analysis_correct_noise(analysis) if SUM_ISOTOPES: analysis = _analysis_isotopes_sum(analysis) if SUM_ISOTOPES and RELATIVE: analysis = _analysis_relative(analysis) analysis['files'] = [] for i in files: analysis['files'].append(i.split(".")[0]) else: if VERBOSE: print "Loading data:" data = _data_grab(files) if VERBOSE: print "Processing data:" analysis = _analysis_construct(files, data, compositions) analysis = _analysis_peakpick_sum(analysis, data) if CORRECT_NOISE: analysis = _analysis_correct_noise(analysis) if SUM_ISOTOPES: analysis = _analysis_isotopes_sum(analysis) if SUM_ISOTOPES and RELATIVE: analysis = _analysis_relative(analysis) if VERBOSE: print "Exporting data:" _export(analysis) print "Finished - total analysis time: " + str(round(time.clock() - time_start, 2)) + " seconds." raw_input() return 0 if __name__ == "__main__": main()