Skip to content

Commit 1b12279

Browse files
committed
Initial commit
0 parents  commit 1b12279

17 files changed

+1106
-0
lines changed

._.DS_Store

4 KB
Binary file not shown.

.gitattributes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Auto detect text files and perform LF normalization
2+
* text=auto

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/data/
2+
/dist/
3+
/setup/
4+
/.spyproject/
5+
*.pyc
6+
*.h5
7+
*.npy
8+
*.npz

automs/fileio.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Module to parse mzML files based on pymzml
3+
"""
4+
# pylint: disable=no-name-in-module
5+
import os
6+
import pymzml
7+
import numpy as np
8+
9+
10+
def readms(file_path):
11+
"""
12+
Read mzML files.
13+
Arguments:
14+
file_path: string
15+
path to the dataset locally
16+
Returns:
17+
Tuple of Numpy arrays: (m/z, intensity, retention time, mean interval of retention time).
18+
19+
Examples:
20+
>>> from hpic.fileio import readms
21+
>>> ms,intensity,rt,rt_mean_interval = readms("MM14_20um.mzxml")
22+
"""
23+
ms_format = os.path.splitext(file_path)[1]
24+
ms_format = ms_format.lower()
25+
if ms_format == '.mzml':
26+
run = pymzml.run.Reader(file_path)
27+
else:
28+
raise Exception('ERROR: %s is wrong format' % file_path)
29+
for n, spec in enumerate(run):
30+
pass
31+
32+
m_s = []
33+
intensity = []
34+
r_t = []
35+
for spectrum in run:
36+
if spectrum.ms_level == 1:
37+
if spectrum.scan_time[1] == 'minute':
38+
r_t.append(spectrum.scan_time[0] * 60)
39+
elif spectrum.scan_time[1] == 'second':
40+
r_t.append(spectrum.scan_time[0])
41+
else:
42+
raise Exception('ERROR: scan time unit is wrong format')
43+
p_ms = []
44+
p_intensity = []
45+
for peak in spectrum.centroidedPeaks:
46+
if peak[1] != 0:
47+
p_ms.append(peak[0])
48+
p_intensity.append(peak[1])
49+
ms_index = np.argsort(np.negative(p_intensity))
50+
m_s.append(np.array(p_ms)[ms_index])
51+
intensity.append(np.array(p_intensity)[ms_index])
52+
rt1 = np.array(r_t)
53+
if rt1.shape[0] > 1:
54+
rt_mean_interval = np.mean(np.diff(rt1))
55+
else:
56+
rt_mean_interval = 0.0
57+
return m_s, intensity, r_t, rt_mean_interval

automs/hpic.py

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
"""
2+
Pure ion chromatograms extraction via HDBSCAN
3+
"""
4+
5+
import time,os,shutil,time
6+
import numpy as np
7+
import hdbscan
8+
from sklearn.preprocessing import minmax_scale
9+
import pandas as pd
10+
from collections import deque
11+
from .mspd import peaks_detection
12+
from .fileio import readms
13+
14+
def maxI(intensity):
15+
max_i = np.array([intensity_i[0] if len(intensity_i)>0 else 0 for intensity_i in intensity])
16+
max_intensity_rt = np.argmax(max_i)
17+
max_intensity_intensity = max_i[max_intensity_rt]
18+
return max_intensity_rt,max_intensity_intensity
19+
20+
def choosedata(ms, intensity, rt, inv_ms, inv_rt, max_intensity_rt, scan):
21+
max_int_ms =ms[max_intensity_rt][0]
22+
start = max_intensity_rt-inv_rt
23+
end = max_intensity_rt+inv_rt
24+
if start <0:
25+
start = 0
26+
if end>scan:
27+
end = scan
28+
choose_spec = []
29+
for index in range(start,end):
30+
ms_index = np.where(np.abs(ms[index]-max_int_ms)<inv_ms)[0]
31+
if ms_index.shape[0]>0:
32+
rt_1 = np.full(ms_index.shape[0],rt[index], dtype=np.double)
33+
choose_spec.extend(zip(rt_1,ms[index][ms_index],intensity[index][ms_index]))
34+
choose_spec = np.array(choose_spec)
35+
choose_rt = rt[start:end]
36+
h_intensity_rt = rt[max_intensity_rt]
37+
return choose_spec,choose_rt,max_int_ms,h_intensity_rt
38+
39+
def hdbscan_lc(choose_spec, choose_rt, h_intensity_rt, max_int_ms, rt_inv, mis_gap):
40+
choose_spec_trans = np.zeros_like(choose_spec[:,:2])
41+
choose_spec_trans[:,1] = np.abs(choose_spec[:,1]-max_int_ms)
42+
choose_spec_trans[:,0] = choose_spec[:,0]-h_intensity_rt
43+
data = minmax_scale(choose_spec_trans)
44+
data[:,1] = data[:,1]*50
45+
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
46+
clusterer.fit(data)
47+
labels = clusterer.labels_
48+
label_index = np.argmax(choose_spec[:,2])
49+
label = labels[label_index]
50+
if label == -1:
51+
choose_spec_1 = choose_spec[label_index].reshape(1,3)
52+
else:
53+
choose_spec_1 = choose_spec[labels==label,:]
54+
choose_spec_1_rt = choose_spec_1[:,0]
55+
choose_rt_1 =choose_rt[choose_rt.index(choose_spec_1_rt[0]):choose_rt.index(choose_spec_1_rt[-1])+1]
56+
no_common = np.setdiff1d(np.array(choose_rt_1),choose_spec_1_rt)
57+
if no_common.shape[0]>mis_gap:
58+
left = no_common[no_common<h_intensity_rt]
59+
if left.shape[0]>mis_gap:
60+
pre_rt = np.searchsorted(choose_spec_1_rt,left[-mis_gap-1])
61+
else:
62+
pre_rt = 0
63+
right = no_common[no_common>h_intensity_rt]
64+
if right.shape[0]>mis_gap:
65+
dif_rt = np.searchsorted(choose_spec_1_rt,right[mis_gap])
66+
else:
67+
dif_rt = len(choose_spec_1_rt)
68+
choose_spec_1 = choose_spec_1[pre_rt:dif_rt,:]
69+
choose_spec_1_rt = choose_spec_1[:,0]
70+
choose_1_rt = np.diff(choose_spec_1_rt)
71+
if not np.all(choose_1_rt):
72+
choose_1_rt_index = list(range(choose_spec_1_rt.shape[0]))
73+
choose_1_rt = np.where(choose_1_rt ==0)[0]
74+
choose_spec_1_rt_1 = choose_spec_1_rt[choose_1_rt]
75+
k= 0
76+
while k<choose_spec_1_rt_1.shape[0]:
77+
p_multiple = choose_spec_1[choose_spec_1_rt==choose_spec_1_rt_1[k]]
78+
p_index = list(range(choose_1_rt[k],choose_1_rt[k]+ p_multiple.shape[0]))
79+
p_index.remove(np.argmin(np.abs(p_multiple[:,1]-max_int_ms))+choose_1_rt[k])
80+
for index in p_index:
81+
choose_1_rt_index.remove(index)
82+
83+
k +=(p_multiple.shape[0]-1)
84+
choose_spec_1= choose_spec_1[choose_1_rt_index,:]
85+
return choose_spec_1
86+
87+
def PIC(inputfile, min_intensity=200, mass_inv=1, rt_inv=15, mis_gap=3, max_items=30000):
88+
ms,intensity,rt,rt_mean_interval=readms(inputfile)
89+
rt_inv = int(rt_inv/rt_mean_interval)
90+
scan = len(rt)
91+
92+
pic_list = {}
93+
while True:
94+
max_intensity_rt,max_intensity_intensity = maxI(intensity)
95+
if max_intensity_intensity<min_intensity:
96+
break
97+
else:
98+
choose_spec,choose_rt,max_int_ms,h_intensity_rt = choosedata(ms,intensity,rt,mass_inv,rt_inv,max_intensity_rt,scan)
99+
if choose_spec.shape[0]<5:
100+
choose_spec_2= choose_spec[np.argmax(choose_spec[:,2])].reshape(1,3)
101+
else:
102+
choose_spec_2 = hdbscan_lc(choose_spec,choose_rt,h_intensity_rt,max_int_ms,rt_inv,mis_gap)
103+
#np.savetxt('%s/%s_%s_%s_1.txt' % (file_t,max_int_ms,h_intensity_rt,max_intensity_intensity),choose_spec)
104+
105+
# np.savetxt('%s/%s_%s_%s_%s_%s_%s.txt' % (file_t,max_int_ms,h_intensity_rt,max_intensity_intensity,choose_spec_2[0,0],choose_spec_2[-1,0],choose_spec_2.shape[0]), choose_spec_2)
106+
pic_list['%s_%s_%s_%s_%s_%s' % (max_int_ms,h_intensity_rt,max_intensity_intensity,choose_spec_2[0,0],choose_spec_2[-1,0],choose_spec_2.shape[0])] = choose_spec_2
107+
del_1_index = rt.index(choose_spec_2[0,0])
108+
del_2_index = rt.index(choose_spec_2[-1,0])+1
109+
del_not = np.setdiff1d(np.array(rt[del_1_index:del_2_index]),choose_spec_2[:,0])
110+
del_all_ms = choose_spec_2[:,1]
111+
del_not_count = 0
112+
for index,del_ms_index in enumerate(range(del_1_index,del_2_index)):
113+
if rt[del_ms_index] in del_not:
114+
del_not_count+=1
115+
else:
116+
index = index-del_not_count
117+
intensity[del_ms_index] = intensity[del_ms_index][ms[del_ms_index]!=del_all_ms[index]]
118+
ms[del_ms_index] = ms[del_ms_index][ms[del_ms_index]!=del_all_ms[index]]
119+
if len(pic_list) >= max_items:
120+
break
121+
122+
pic_list_refine = {}
123+
for k, pic in pic_list.items():
124+
if len(pic) >= 2:
125+
pic_list_refine[k] = pic
126+
127+
return rt_mean_interval, pic_list_refine
128+
129+
130+
def lc_ms_peak(data, scales, min_snr, data_p, intensity):
131+
if data_p:
132+
peak_list = peaks_detection(data, scales, min_snr,intensity)
133+
else:
134+
data = np.loadtxt(r'%s' % data)
135+
peak_list = peaks_detection(data, scales, min_snr,intensity)
136+
return list(peak_list)
137+
138+
139+
def to_deque(pic_list, min_snr, rt_v, intensity):
140+
number = 10
141+
width = np.arange(1,60)
142+
peak_list = []
143+
alls = []
144+
files = np.array(list(pic_list.keys()))
145+
for each in files:
146+
each = [float(s) for s in each.split('_')]
147+
alls.append(list(each))
148+
frame = np.array(alls)
149+
files = np.array(files)[np.argsort(frame[:,0])]
150+
frame = frame[np.argsort(frame[:,0])]
151+
frame_mz = frame[:,0]
152+
l_frame = frame.shape[0]
153+
i = 0
154+
count = []
155+
new_pic_list = {}
156+
while i<l_frame:
157+
if i not in count:
158+
mz = frame_mz[i]
159+
mz_range = mz +0.1
160+
p_frame = frame[i:,:][frame_mz[i:]<mz_range]
161+
length = p_frame.shape[0]
162+
if length >1:
163+
pre = p_frame[0,3]
164+
dif = p_frame[0,4]
165+
condition = np.array(list(p_frame[:,3]-dif) + list(pre-p_frame[:,4]))
166+
if condition[(condition>0)&(condition<rt_v)].shape[0] >0 :
167+
file_sep = deque()
168+
file_sep.append(0)
169+
a = np.where(((pre-p_frame[:,4])>0)&((pre-p_frame[:,4])<rt_v))[0]
170+
while a.shape[0]>0 and p_frame[p_frame[:,3]==pre].shape[0]<2:
171+
file_sep.appendleft(a[0])
172+
pre = p_frame[a[0],3]
173+
a = np.where(((pre-p_frame[:,4])>0)&((pre-p_frame[:,4])<rt_v))[0]
174+
b = np.where(((p_frame[:,3]-dif)>0)&((p_frame[:,3]-dif)<rt_v))[0]
175+
while b.shape[0]>0 and p_frame[p_frame[:,4]==dif].shape[0]<2 :
176+
file_sep.append(b[0])
177+
dif = p_frame[b[0],4]
178+
b = np.where(((p_frame[:,3]-dif)>0)&((p_frame[:,3]-dif)<rt_v))[0]
179+
new = np.zeros(6)
180+
new[:3] = p_frame[file_sep[np.argmax(p_frame[file_sep,2])],:3]
181+
index = [m+i for m in file_sep]
182+
183+
mass = np.vstack([pic_list[file_i] for file_i in list(files[index])])
184+
new[3] = mass[0,0]
185+
new[4] = mass[-1,0]
186+
new[5] = mass.shape[0]
187+
if new[5] >=number:
188+
label = '%s_%s_%s_%s_%s_%s' % (new[0],new[1],new[2],new[3],new[4],new[5])
189+
new_pic_list[label] = mass
190+
peaks = lc_ms_peak(mass,width,min_snr,True,intensity)
191+
peaks = [np.append(p, label) for p in peaks]
192+
peak_list.extend(peaks)
193+
count.extend(index)
194+
else:
195+
if frame[i,5]>=number:
196+
label = files[i]
197+
mass = pic_list[files[i]]
198+
new_pic_list[label] = mass
199+
peaks = lc_ms_peak(mass,width,min_snr,True,intensity)
200+
peaks = [np.append(p, label) for p in peaks]
201+
peak_list.extend(peaks)
202+
else:
203+
if frame[i,5]>=number:
204+
label = files[i]
205+
mass = pic_list[files[i]]
206+
new_pic_list[label] = mass
207+
peaks = lc_ms_peak(mass,width,min_snr,True,intensity)
208+
peaks = [np.append(p, label) for p in peaks]
209+
peak_list.extend(peaks)
210+
211+
i += 1
212+
213+
peak_list = pd.DataFrame(peak_list, columns = ['mz','rt','intensity','rt1','rt2','signal','snr','ind','shape','pic_label'])
214+
for col in peak_list.columns[:-1]:
215+
peak_list[col] = peak_list[col].astype(float)
216+
217+
return peak_list, new_pic_list
218+
219+
220+
def hpic(file_in, min_intensity=250, min_snr=3, mass_inv=1, rt_inv=15):
221+
"""
222+
Extract pure ion chromatograms from LC-MS dataset
223+
224+
Arguments:
225+
file_in: string
226+
path to the dataset locally
227+
file_out: string
228+
folder to store the extraction result.
229+
min_intensity: float
230+
minimum intensity of the peak
231+
min_snr: float
232+
minimum signal noise ratio
233+
mass_inv: float
234+
minimum interval of the m/z values
235+
rt_inv: float
236+
minimum interval of the retention time
237+
238+
Returns:
239+
Numpy array: shape = (n, 9).
240+
n is the number of the extracted PICs, each row is a PIC
241+
The meaning of each column is as following
242+
# 0 -> m/z value
243+
# 1 -> retention time at apex
244+
# 2 -> intensity at apex
245+
# 3 -> start of retention time
246+
# 4 -> end of retention time
247+
# 5 -> intensity in CWT space
248+
# 6 -> SNR in CWT space
249+
# 7 -> index of the peak
250+
# 8 -> length of the peak
251+
"""
252+
253+
interval, pic_list = PIC(file_in, min_intensity)
254+
interval *= 1.5
255+
return to_deque(pic_list, min_snr, interval, min_intensity)
256+
257+
258+
if __name__ == '__main__':
259+
260+
min_intensity = 5000
261+
file_in = "D:/project/PeakEva/data/600mix_pos.mzML"
262+
263+
peak_list, pic_list = hpic(file_in, min_intensity=2000, min_snr=3, mass_inv=1, rt_inv=15)
264+

0 commit comments

Comments
 (0)