hackrf_sweep-process-data.py (forked from tesorrells/RF-Drone-Detection)
import os
from typing import List
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
def dt_lookup(s):
"""
Helper func. This is an extremely fast approach to datetime parsing.
For large data, the same dates are often repeated. Rather than
re-parse these, we store all unique dates, parse them, and
use a lookup to convert all dates.
https://stackoverflow.com/questions/29882573/pandas-slow-date-conversion
"""
return s.map({date: pd.to_datetime(date) for date in s.unique()})
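# Example (a minimal sketch): a Series with many repeated date strings is parsed
# with one pd.to_datetime call per unique value rather than one per row:
#     s = pd.Series(["2019-02-15 11:03:17"] * 1000 + ["2019-02-15 11:03:18"] * 1000)
#     parsed = dt_lookup(s)  # 2 parses instead of 2000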
def read_hackrf_sweep_file_and_merge(path) -> pd.DataFrame:
"""
    Takes the path to an output file from hackrf_sweep. hackrf_sweep spreads the output of a single
    sweep over multiple lines; this function combines those lines so that:
    - one row is one whole sweep (180 bins)
    - rows are indexed by datetime timestamp
    - columns (i.e. bins) are named floor(starting_bin_freq_hz)
    With the current hackrf_sweep settings, this yields a dataframe with 180 columns
:param path: filepath_or_buffer : str, path object, or file-like object
:return: pd.DataFrame
"""
# Read CSV hackrf_sweep output to Pandas DataFrame
hackrf_df: pd.DataFrame = pd.read_csv(path, header=None, low_memory=False)
# Index everything by datetime
datetime = pd.DatetimeIndex(dt_lookup(hackrf_df[0] + hackrf_df[1]))
hackrf_df.set_index(datetime, inplace=True)
# Group according to datetime timestamp
    # Find the number of bins we expect per sweep (the mode across sweeps), so incomplete sweeps can be discarded
    expected_num_bins = hackrf_df.groupby(hackrf_df.index).apply(lambda x: x.values[:, 6:].ravel().size).mode()[0]
    # Filter out incomplete sweeps, i.e. those with fewer than the expected number of bins
    merged_df = hackrf_df.groupby(hackrf_df.index).filter(lambda x: x.values[:, 6:].ravel().size == expected_num_bins)
# combine multiple rows corresponding to a single sweep/timestamp into one row, datetime indexed
merged_df = merged_df.groupby(merged_df.index).apply(lambda x: pd.Series(x.values[:, 6:].ravel()))
num_bins = merged_df.shape[1]
# set col name to min freq for each sample i.e. sample starting at 2400000000 hz
min_freq = hackrf_df[2].min()
max_freq = hackrf_df[3].max()
bin_size = (max_freq - min_freq) / num_bins # in hz
merged_df.columns = [int((min_freq + x * bin_size)) for x in range(0, num_bins)]
    # Ensure the values are floats, not strings, so downstream plotting (e.g. the scatterplot) works
merged_df = merged_df.astype(float)
return merged_df
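# Example usage (a minimal sketch; the path is hypothetical):
#     df = read_hackrf_sweep_file_and_merge("../data/example_sweep.csv")
#     df.shape       # (num_sweeps, num_bins) -- one row per complete sweep
#     df.columns[0]  # start frequency of the first bin, in Hz (e.g. 2400000000)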
def get_mean_by_bin(df: pd.DataFrame) -> pd.Series:
"""
    Takes all the sweep data from the input dataframe as returned by read_hackrf_sweep_file_and_merge
    and gets the average dB for each bin.
    :param df: pd.DataFrame from the experiment in question
    :return: pd.Series of per-bin averages
"""
return df.mean(axis=0)
def get_max_by_bin(df: pd.DataFrame) -> pd.Series:
"""
    Takes all the sweep data from the input dataframe as returned by read_hackrf_sweep_file_and_merge
    and gets the max dB for each bin.
    :param df: pd.DataFrame from the experiment in question
    :return: pd.Series of per-bin maxima
"""
return df.max(axis=0)
def get_train_test_data(positive: List, negative: List, test_size=0.2):
"""
    Takes lists of string paths to data files for both positive and negative examples
    of our hackrf_sweep data and returns a train/test split of inputs and their labels.
    :param positive: the list of files containing positive examples
    :param negative: the list of files containing negative examples
    :return: a four-tuple of x_train, x_test, y_train, y_test
"""
# 1 for positive examples (drones flying) and 0 for just random noise
labels = [1.0, 0.0]
labeled_data = pd.DataFrame()
for files, label in zip([positive, negative], labels):
for filename in files:
new_data = read_hackrf_sweep_file_and_merge(filename)
new_data['label'] = label
            labeled_data = pd.concat([labeled_data, new_data])  # DataFrame.append was removed in newer pandas
labels = labeled_data['label']
inputs = labeled_data.drop('label', axis=1)
    return train_test_split(inputs, labels, test_size=test_size)
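# Example usage (a minimal sketch; the file lists are hypothetical):
#     x_train, x_test, y_train, y_test = get_train_test_data(
#         positive=["drone_10m.csv", "drone_25m.csv"],
#         negative=["background.csv"])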
def get_heat_map():
    """
    Builds heat maps of per-bin average power at increasing distances (5 m to 50 m in 5 m steps).
    NOTE: the distance-indexed path templates below are placeholders for the local GTRI data
    files; the '5 meter' file must be named '5' rather than '05' for the pattern to match.
    """
    avgs_over_distance_dr = []
    avgs_over_distance_bg = []
    for i in range(5, 55, 5):
        filename_dr = "../../GTRI_drone_data/%d_meters_drone.csv" % i  # placeholder path template
        filename_bg = "../../GTRI_drone_data/%d_meters_background.csv" % i  # placeholder path template
        sample_dr = read_hackrf_sweep_file_and_merge(filename_dr)
        sample_bg = read_hackrf_sweep_file_and_merge(filename_bg)
        avgs_over_distance_dr.append(get_mean_by_bin(sample_dr).values)
        avgs_over_distance_bg.append(get_mean_by_bin(sample_bg).values)
    # one row per distance, one column per bin
    avgs_over_distance_dr = np.reshape(avgs_over_distance_dr, (-1, 180))
    avgs_over_distance_bg = np.reshape(avgs_over_distance_bg, (-1, 180))
    fig1, ax1 = plt.subplots()
    ax1.imshow(avgs_over_distance_dr)
    ax1.set_title("HackRF bins vs Distance to Drone")
    fig1.savefig('../figures/heatmap_dr.png')
    fig2, ax2 = plt.subplots()
    ax2.imshow(avgs_over_distance_bg)
    ax2.set_title("HackRF bins vs Background Noise")
    fig2.savefig('../figures/heatmap_bg.png')
def make_svm(x_train: pd.DataFrame, y_train: pd.Series) -> SVC:
    clf = SVC(gamma='auto')
    return clf.fit(x_train, y_train)
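# Example (a minimal sketch): fit on the training split from get_train_test_data,
# then score on the held-out split:
#     clf = make_svm(x_train, y_train)
#     print("Held-out accuracy: %.3f" % clf.score(x_test, y_test))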
def plot_learning_curve(estimator: SVC, title, X, y):
plt.figure()
plt.title(title)
plt.xlabel("Training Examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(estimator, X, y)
    train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
return plt
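# Example (a minimal sketch; assumes a split from get_train_test_data):
#     clf = make_svm(x_train, y_train)
#     plot_learning_curve(clf, "SVC learning curve", x_train, y_train).show()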
def get_scatterplot(filename1, filename2):
    # Scatter plot comparing per-bin max signal strength for background vs drone recordings
sample_br = read_hackrf_sweep_file_and_merge(filename1)
sample_dr = read_hackrf_sweep_file_and_merge(filename2)
print(sample_dr)
max_by_bin_br = get_max_by_bin(sample_br)
max_by_bin_dr = get_max_by_bin(sample_dr)
fig1, ax1 = plt.subplots()
print(max_by_bin_dr)
ax1.scatter(max_by_bin_br.index, max_by_bin_br.values, c="b", label="background only")
ax1.scatter(max_by_bin_dr.index, max_by_bin_dr.values, c="r", label="drone on")
plt.legend(loc='upper left')
ax1.set_title("Background vs Drone Activity")
#plt.ylim(-75, -50)
plt.xlabel("Frequency (gHz)")
plt.ylabel("Signal strength (dB)")
plt.savefig('../figures/comparison_scatter_MAX.png')
if __name__ == "__main__":
get_scatterplot('../../GTRI_drone_data/background.csv', '../../GTRI_drone_data/longdistance_withdrone.csv')
'''
if __name__ == "__main__":
    drone_filename = "../data/2019.02.15_dji/2019.02.15.10_meters_dji.csv"
    noise_filename = "../data/2019.02.15_dji/2019.02.15.bg_after_10_meters_dji.csv"
# filename = "../data/25_meters.csv"
print("Working in %s" % os.getcwd())
print("Using file %s" % drone_filename)
print("Example of reading in 10 meter dji data")
print("=======================================")
sample_data = read_hackrf_sweep_file_and_merge(drone_filename)
sample_data.info() # just print out some info about the dataframe
print("---------------------------------------")
print(sample_data.head(5))
print("=======================================")
print("\nExample of getting avg for each bin (pandas Series):")
print("=======================================")
avg_by_bin = get_mean_by_bin(sample_data)
print("Shape: %s" % avg_by_bin.shape)
print("---------------------------------------")
print(avg_by_bin.head(4))
print("\t...")
print(avg_by_bin.tail(4))
print("=======================================")
print("\n Creating a split of our positive and negative examples for use in learning models: ")
x_train, x_test, y_train, y_test = get_train_test_data(positive=[drone_filename], negative=[noise_filename])
print("Training examples: " + str(x_train.head()) + "\nTraining Labels: " + str(y_train.head()))
'''