|
| 1 | +""" |
| 2 | +Pure ion chromatograms extraction via HDBSCAN |
| 3 | +""" |
| 4 | + |
| 5 | +import time,os,shutil,time |
| 6 | +import numpy as np |
| 7 | +import hdbscan |
| 8 | +from sklearn.preprocessing import minmax_scale |
| 9 | +import pandas as pd |
| 10 | +from collections import deque |
| 11 | +from .mspd import peaks_detection |
| 12 | +from .fileio import readms |
| 13 | + |
| 14 | +def maxI(intensity): |
| 15 | + max_i = np.array([intensity_i[0] if len(intensity_i)>0 else 0 for intensity_i in intensity]) |
| 16 | + max_intensity_rt = np.argmax(max_i) |
| 17 | + max_intensity_intensity = max_i[max_intensity_rt] |
| 18 | + return max_intensity_rt,max_intensity_intensity |
| 19 | + |
| 20 | +def choosedata(ms, intensity, rt, inv_ms, inv_rt, max_intensity_rt, scan): |
| 21 | + max_int_ms =ms[max_intensity_rt][0] |
| 22 | + start = max_intensity_rt-inv_rt |
| 23 | + end = max_intensity_rt+inv_rt |
| 24 | + if start <0: |
| 25 | + start = 0 |
| 26 | + if end>scan: |
| 27 | + end = scan |
| 28 | + choose_spec = [] |
| 29 | + for index in range(start,end): |
| 30 | + ms_index = np.where(np.abs(ms[index]-max_int_ms)<inv_ms)[0] |
| 31 | + if ms_index.shape[0]>0: |
| 32 | + rt_1 = np.full(ms_index.shape[0],rt[index], dtype=np.double) |
| 33 | + choose_spec.extend(zip(rt_1,ms[index][ms_index],intensity[index][ms_index])) |
| 34 | + choose_spec = np.array(choose_spec) |
| 35 | + choose_rt = rt[start:end] |
| 36 | + h_intensity_rt = rt[max_intensity_rt] |
| 37 | + return choose_spec,choose_rt,max_int_ms,h_intensity_rt |
| 38 | + |
| 39 | +def hdbscan_lc(choose_spec, choose_rt, h_intensity_rt, max_int_ms, rt_inv, mis_gap): |
| 40 | + choose_spec_trans = np.zeros_like(choose_spec[:,:2]) |
| 41 | + choose_spec_trans[:,1] = np.abs(choose_spec[:,1]-max_int_ms) |
| 42 | + choose_spec_trans[:,0] = choose_spec[:,0]-h_intensity_rt |
| 43 | + data = minmax_scale(choose_spec_trans) |
| 44 | + data[:,1] = data[:,1]*50 |
| 45 | + clusterer = hdbscan.HDBSCAN(min_cluster_size=5) |
| 46 | + clusterer.fit(data) |
| 47 | + labels = clusterer.labels_ |
| 48 | + label_index = np.argmax(choose_spec[:,2]) |
| 49 | + label = labels[label_index] |
| 50 | + if label == -1: |
| 51 | + choose_spec_1 = choose_spec[label_index].reshape(1,3) |
| 52 | + else: |
| 53 | + choose_spec_1 = choose_spec[labels==label,:] |
| 54 | + choose_spec_1_rt = choose_spec_1[:,0] |
| 55 | + choose_rt_1 =choose_rt[choose_rt.index(choose_spec_1_rt[0]):choose_rt.index(choose_spec_1_rt[-1])+1] |
| 56 | + no_common = np.setdiff1d(np.array(choose_rt_1),choose_spec_1_rt) |
| 57 | + if no_common.shape[0]>mis_gap: |
| 58 | + left = no_common[no_common<h_intensity_rt] |
| 59 | + if left.shape[0]>mis_gap: |
| 60 | + pre_rt = np.searchsorted(choose_spec_1_rt,left[-mis_gap-1]) |
| 61 | + else: |
| 62 | + pre_rt = 0 |
| 63 | + right = no_common[no_common>h_intensity_rt] |
| 64 | + if right.shape[0]>mis_gap: |
| 65 | + dif_rt = np.searchsorted(choose_spec_1_rt,right[mis_gap]) |
| 66 | + else: |
| 67 | + dif_rt = len(choose_spec_1_rt) |
| 68 | + choose_spec_1 = choose_spec_1[pre_rt:dif_rt,:] |
| 69 | + choose_spec_1_rt = choose_spec_1[:,0] |
| 70 | + choose_1_rt = np.diff(choose_spec_1_rt) |
| 71 | + if not np.all(choose_1_rt): |
| 72 | + choose_1_rt_index = list(range(choose_spec_1_rt.shape[0])) |
| 73 | + choose_1_rt = np.where(choose_1_rt ==0)[0] |
| 74 | + choose_spec_1_rt_1 = choose_spec_1_rt[choose_1_rt] |
| 75 | + k= 0 |
| 76 | + while k<choose_spec_1_rt_1.shape[0]: |
| 77 | + p_multiple = choose_spec_1[choose_spec_1_rt==choose_spec_1_rt_1[k]] |
| 78 | + p_index = list(range(choose_1_rt[k],choose_1_rt[k]+ p_multiple.shape[0])) |
| 79 | + p_index.remove(np.argmin(np.abs(p_multiple[:,1]-max_int_ms))+choose_1_rt[k]) |
| 80 | + for index in p_index: |
| 81 | + choose_1_rt_index.remove(index) |
| 82 | + |
| 83 | + k +=(p_multiple.shape[0]-1) |
| 84 | + choose_spec_1= choose_spec_1[choose_1_rt_index,:] |
| 85 | + return choose_spec_1 |
| 86 | + |
| 87 | +def PIC(inputfile, min_intensity=200, mass_inv=1, rt_inv=15, mis_gap=3, max_items=30000): |
| 88 | + ms,intensity,rt,rt_mean_interval=readms(inputfile) |
| 89 | + rt_inv = int(rt_inv/rt_mean_interval) |
| 90 | + scan = len(rt) |
| 91 | + |
| 92 | + pic_list = {} |
| 93 | + while True: |
| 94 | + max_intensity_rt,max_intensity_intensity = maxI(intensity) |
| 95 | + if max_intensity_intensity<min_intensity: |
| 96 | + break |
| 97 | + else: |
| 98 | + choose_spec,choose_rt,max_int_ms,h_intensity_rt = choosedata(ms,intensity,rt,mass_inv,rt_inv,max_intensity_rt,scan) |
| 99 | + if choose_spec.shape[0]<5: |
| 100 | + choose_spec_2= choose_spec[np.argmax(choose_spec[:,2])].reshape(1,3) |
| 101 | + else: |
| 102 | + choose_spec_2 = hdbscan_lc(choose_spec,choose_rt,h_intensity_rt,max_int_ms,rt_inv,mis_gap) |
| 103 | + #np.savetxt('%s/%s_%s_%s_1.txt' % (file_t,max_int_ms,h_intensity_rt,max_intensity_intensity),choose_spec) |
| 104 | + |
| 105 | + # np.savetxt('%s/%s_%s_%s_%s_%s_%s.txt' % (file_t,max_int_ms,h_intensity_rt,max_intensity_intensity,choose_spec_2[0,0],choose_spec_2[-1,0],choose_spec_2.shape[0]), choose_spec_2) |
| 106 | + pic_list['%s_%s_%s_%s_%s_%s' % (max_int_ms,h_intensity_rt,max_intensity_intensity,choose_spec_2[0,0],choose_spec_2[-1,0],choose_spec_2.shape[0])] = choose_spec_2 |
| 107 | + del_1_index = rt.index(choose_spec_2[0,0]) |
| 108 | + del_2_index = rt.index(choose_spec_2[-1,0])+1 |
| 109 | + del_not = np.setdiff1d(np.array(rt[del_1_index:del_2_index]),choose_spec_2[:,0]) |
| 110 | + del_all_ms = choose_spec_2[:,1] |
| 111 | + del_not_count = 0 |
| 112 | + for index,del_ms_index in enumerate(range(del_1_index,del_2_index)): |
| 113 | + if rt[del_ms_index] in del_not: |
| 114 | + del_not_count+=1 |
| 115 | + else: |
| 116 | + index = index-del_not_count |
| 117 | + intensity[del_ms_index] = intensity[del_ms_index][ms[del_ms_index]!=del_all_ms[index]] |
| 118 | + ms[del_ms_index] = ms[del_ms_index][ms[del_ms_index]!=del_all_ms[index]] |
| 119 | + if len(pic_list) >= max_items: |
| 120 | + break |
| 121 | + |
| 122 | + pic_list_refine = {} |
| 123 | + for k, pic in pic_list.items(): |
| 124 | + if len(pic) >= 2: |
| 125 | + pic_list_refine[k] = pic |
| 126 | + |
| 127 | + return rt_mean_interval, pic_list_refine |
| 128 | + |
| 129 | + |
| 130 | +def lc_ms_peak(data, scales, min_snr, data_p, intensity): |
| 131 | + if data_p: |
| 132 | + peak_list = peaks_detection(data, scales, min_snr,intensity) |
| 133 | + else: |
| 134 | + data = np.loadtxt(r'%s' % data) |
| 135 | + peak_list = peaks_detection(data, scales, min_snr,intensity) |
| 136 | + return list(peak_list) |
| 137 | + |
| 138 | + |
| 139 | +def to_deque(pic_list, min_snr, rt_v, intensity): |
| 140 | + number = 10 |
| 141 | + width = np.arange(1,60) |
| 142 | + peak_list = [] |
| 143 | + alls = [] |
| 144 | + files = np.array(list(pic_list.keys())) |
| 145 | + for each in files: |
| 146 | + each = [float(s) for s in each.split('_')] |
| 147 | + alls.append(list(each)) |
| 148 | + frame = np.array(alls) |
| 149 | + files = np.array(files)[np.argsort(frame[:,0])] |
| 150 | + frame = frame[np.argsort(frame[:,0])] |
| 151 | + frame_mz = frame[:,0] |
| 152 | + l_frame = frame.shape[0] |
| 153 | + i = 0 |
| 154 | + count = [] |
| 155 | + new_pic_list = {} |
| 156 | + while i<l_frame: |
| 157 | + if i not in count: |
| 158 | + mz = frame_mz[i] |
| 159 | + mz_range = mz +0.1 |
| 160 | + p_frame = frame[i:,:][frame_mz[i:]<mz_range] |
| 161 | + length = p_frame.shape[0] |
| 162 | + if length >1: |
| 163 | + pre = p_frame[0,3] |
| 164 | + dif = p_frame[0,4] |
| 165 | + condition = np.array(list(p_frame[:,3]-dif) + list(pre-p_frame[:,4])) |
| 166 | + if condition[(condition>0)&(condition<rt_v)].shape[0] >0 : |
| 167 | + file_sep = deque() |
| 168 | + file_sep.append(0) |
| 169 | + a = np.where(((pre-p_frame[:,4])>0)&((pre-p_frame[:,4])<rt_v))[0] |
| 170 | + while a.shape[0]>0 and p_frame[p_frame[:,3]==pre].shape[0]<2: |
| 171 | + file_sep.appendleft(a[0]) |
| 172 | + pre = p_frame[a[0],3] |
| 173 | + a = np.where(((pre-p_frame[:,4])>0)&((pre-p_frame[:,4])<rt_v))[0] |
| 174 | + b = np.where(((p_frame[:,3]-dif)>0)&((p_frame[:,3]-dif)<rt_v))[0] |
| 175 | + while b.shape[0]>0 and p_frame[p_frame[:,4]==dif].shape[0]<2 : |
| 176 | + file_sep.append(b[0]) |
| 177 | + dif = p_frame[b[0],4] |
| 178 | + b = np.where(((p_frame[:,3]-dif)>0)&((p_frame[:,3]-dif)<rt_v))[0] |
| 179 | + new = np.zeros(6) |
| 180 | + new[:3] = p_frame[file_sep[np.argmax(p_frame[file_sep,2])],:3] |
| 181 | + index = [m+i for m in file_sep] |
| 182 | + |
| 183 | + mass = np.vstack([pic_list[file_i] for file_i in list(files[index])]) |
| 184 | + new[3] = mass[0,0] |
| 185 | + new[4] = mass[-1,0] |
| 186 | + new[5] = mass.shape[0] |
| 187 | + if new[5] >=number: |
| 188 | + label = '%s_%s_%s_%s_%s_%s' % (new[0],new[1],new[2],new[3],new[4],new[5]) |
| 189 | + new_pic_list[label] = mass |
| 190 | + peaks = lc_ms_peak(mass,width,min_snr,True,intensity) |
| 191 | + peaks = [np.append(p, label) for p in peaks] |
| 192 | + peak_list.extend(peaks) |
| 193 | + count.extend(index) |
| 194 | + else: |
| 195 | + if frame[i,5]>=number: |
| 196 | + label = files[i] |
| 197 | + mass = pic_list[files[i]] |
| 198 | + new_pic_list[label] = mass |
| 199 | + peaks = lc_ms_peak(mass,width,min_snr,True,intensity) |
| 200 | + peaks = [np.append(p, label) for p in peaks] |
| 201 | + peak_list.extend(peaks) |
| 202 | + else: |
| 203 | + if frame[i,5]>=number: |
| 204 | + label = files[i] |
| 205 | + mass = pic_list[files[i]] |
| 206 | + new_pic_list[label] = mass |
| 207 | + peaks = lc_ms_peak(mass,width,min_snr,True,intensity) |
| 208 | + peaks = [np.append(p, label) for p in peaks] |
| 209 | + peak_list.extend(peaks) |
| 210 | + |
| 211 | + i += 1 |
| 212 | + |
| 213 | + peak_list = pd.DataFrame(peak_list, columns = ['mz','rt','intensity','rt1','rt2','signal','snr','ind','shape','pic_label']) |
| 214 | + for col in peak_list.columns[:-1]: |
| 215 | + peak_list[col] = peak_list[col].astype(float) |
| 216 | + |
| 217 | + return peak_list, new_pic_list |
| 218 | + |
| 219 | + |
| 220 | +def hpic(file_in, min_intensity=250, min_snr=3, mass_inv=1, rt_inv=15): |
| 221 | + """ |
| 222 | + Extract pure ion chromatograms from LC-MS dataset |
| 223 | +
|
| 224 | + Arguments: |
| 225 | + file_in: string |
| 226 | + path to the dataset locally |
| 227 | + file_out: string |
| 228 | + folder to store the extraction result. |
| 229 | + min_intensity: float |
| 230 | + minimum intensity of the peak |
| 231 | + min_snr: float |
| 232 | + minimum signal noise ratio |
| 233 | + mass_inv: float |
| 234 | + minimum interval of the m/z values |
| 235 | + rt_inv: float |
| 236 | + minimum interval of the retention time |
| 237 | +
|
| 238 | + Returns: |
| 239 | + Numpy array: shape = (n, 9). |
| 240 | + n is the number of the extracted PICs, each row is a PIC |
| 241 | + The meaning of each column is as following |
| 242 | + # 0 -> m/z value |
| 243 | + # 1 -> retention time at apex |
| 244 | + # 2 -> intensity at apex |
| 245 | + # 3 -> start of retention time |
| 246 | + # 4 -> end of retention time |
| 247 | + # 5 -> intensity in CWT space |
| 248 | + # 6 -> SNR in CWT space |
| 249 | + # 7 -> index of the peak |
| 250 | + # 8 -> length of the peak |
| 251 | + """ |
| 252 | + |
| 253 | + interval, pic_list = PIC(file_in, min_intensity) |
| 254 | + interval *= 1.5 |
| 255 | + return to_deque(pic_list, min_snr, interval, min_intensity) |
| 256 | + |
| 257 | + |
| 258 | +if __name__ == '__main__': |
| 259 | + |
| 260 | + min_intensity = 5000 |
| 261 | + file_in = "D:/project/PeakEva/data/600mix_pos.mzML" |
| 262 | + |
| 263 | + peak_list, pic_list = hpic(file_in, min_intensity=2000, min_snr=3, mass_inv=1, rt_inv=15) |
| 264 | + |
0 commit comments