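// demo.cpp: Facial Expression Recognition demo.
// Detects faces with YuNet, then classifies each face's expression with a
// facial expression recognition network. Example invocation (the binary name
// "demo" is assumed here):
//   ./demo -i=/path/to/image.jpg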
#include "opencv2/opencv.hpp"
#include <map>
#include <vector>
#include <string>
#include <iostream>
using namespace std;
using namespace cv;
using namespace dnn;
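// Backend-target pairs selectable through the "backend_target" command-line
// option; the option value is an index into this table.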
std::vector<std::pair<int, int>> backend_target_pairs = {
    {DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
    {DNN_BACKEND_CUDA,   DNN_TARGET_CUDA},
    {DNN_BACKEND_CUDA,   DNN_TARGET_CUDA_FP16},
    {DNN_BACKEND_TIMVX,  DNN_TARGET_NPU},
    {DNN_BACKEND_CANN,   DNN_TARGET_NPU}
};
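// FER wraps the facial expression recognition network. It aligns a detected
// face to a canonical 112x112 patch using five landmarks, normalizes the
// patch, and runs a forward pass to obtain one of seven expression labels.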
class FER
{
private:
    Net model;
    string modelPath;
    // Reference landmark positions (right eye, left eye, nose tip, right and
    // left mouth corners) in the 112x112 aligned face patch.
    float stdLandmarks[5][2] = {
        {38.2946f, 51.6963f},
        {73.5318f, 51.5014f},
        {56.0252f, 71.7366f},
        {41.5493f, 92.3655f},
        {70.7299f, 92.2041f}
    };
    vector<String> expressionEnum = {
        "angry", "disgust", "fearful",
        "happy", "neutral", "sad", "surprised"
    };
    Mat stdPoints = Mat(5, 2, CV_32F, this->stdLandmarks);
    Size patchSize = Size(112, 112);
    Scalar imageMean = Scalar(0.5, 0.5, 0.5);
    Scalar imageStd = Scalar(0.5, 0.5, 0.5);
    const String inputNames = "data";
    const String outputNames = "label";
    int backend_id;
    int target_id;

public:
    FER(const string& modelPath,
        int backend_id = 0,
        int target_id = 0)
        : modelPath(modelPath), backend_id(backend_id), target_id(target_id)
    {
        this->model = readNet(modelPath);
        this->model.setPreferableBackend(backend_id);
        this->model.setPreferableTarget(target_id);
    }

    Mat preprocess(const Mat& image, const Mat& points)
    {
        // Image alignment: map the detected landmarks onto the reference
        // positions with an affine transform.
        Mat transformation = estimateAffine2D(points, this->stdPoints);
        Mat aligned;
        warpAffine(image, aligned, transformation, this->patchSize);

        // Image normalization: scale to [0, 1], then subtract the mean and
        // divide by the standard deviation.
        aligned.convertTo(aligned, CV_32F, 1.0 / 255.0);
        aligned -= imageMean;
        aligned /= imageStd;
        return blobFromImage(aligned);
    }

    String infer(const Mat& image, const Mat& facePoints)
    {
        // Columns 4..13 of a YuNet detection row hold the five landmark
        // (x, y) pairs; reshape them into a 5-row, two-channel point matrix.
        Mat points = facePoints(Rect(4, 0, facePoints.cols - 5, facePoints.rows)).reshape(2, 5);
        Mat inputBlob = preprocess(image, points);
        this->model.setInput(inputBlob, this->inputNames);
        Mat outputBlob = this->model.forward(this->outputNames);
        Point maxLoc;
        minMaxLoc(outputBlob, nullptr, nullptr, nullptr, &maxLoc);
        return getDesc(maxLoc.x);
    }

    String getDesc(int ind)
    {
        if (ind >= 0 && ind < static_cast<int>(this->expressionEnum.size()))
        {
            return this->expressionEnum[ind];
        }
        else
        {
            cerr << "Error: Index out of bounds." << endl;
            return "";
        }
    }
};
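// YuNet wraps OpenCV's FaceDetectorYN. Each detection row it returns packs
// the bounding box, five facial landmarks, and a confidence score.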
class YuNet
{
public:
    YuNet(const string& model_path,
          const Size& input_size = Size(320, 320),
          float conf_threshold = 0.6f,
          float nms_threshold = 0.3f,
          int top_k = 5000,
          int backend_id = 0,
          int target_id = 0)
        : model_path_(model_path), input_size_(input_size),
          conf_threshold_(conf_threshold), nms_threshold_(nms_threshold),
          top_k_(top_k), backend_id_(backend_id), target_id_(target_id)
    {
        model = FaceDetectorYN::create(model_path_, "", input_size_, conf_threshold_, nms_threshold_, top_k_, backend_id_, target_id_);
    }

    void setBackendAndTarget(int backend_id, int target_id)
    {
        backend_id_ = backend_id;
        target_id_ = target_id;
        model = FaceDetectorYN::create(model_path_, "", input_size_, conf_threshold_, nms_threshold_, top_k_, backend_id_, target_id_);
    }

    // Set the input size of the detector. Size format: [Width, Height].
    void setInputSize(const Size& input_size)
    {
        input_size_ = input_size;
        model->setInputSize(input_size_);
    }

    Mat infer(const Mat& image)
    {
        Mat res;
        model->detect(image, res);
        return res;
    }

private:
    Ptr<FaceDetectorYN> model;
    string model_path_;
    Size input_size_;
    float conf_threshold_;
    float nms_threshold_;
    int top_k_;
    int backend_id_;
    int target_id_;
};
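// Draw detections on a copy of the input image. Each row of `faces` is laid
// out as [x, y, w, h, lm0x, lm0y, ..., lm4x, lm4y, score].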
cv::Mat visualize(const cv::Mat& image, const cv::Mat& faces, const vector<String>& expressions, float fps = -1.f)
{
    static cv::Scalar box_color{0, 255, 0};
    static std::vector<cv::Scalar> landmark_color{
        cv::Scalar(255,   0,   0), // right eye
        cv::Scalar(  0,   0, 255), // left eye
        cv::Scalar(  0, 255,   0), // nose tip
        cv::Scalar(255,   0, 255), // right mouth corner
        cv::Scalar(  0, 255, 255)  // left mouth corner
    };
    static cv::Scalar text_color{0, 255, 0};

    auto output_image = image.clone();

    if (fps >= 0)
    {
        cv::putText(output_image, cv::format("FPS: %.2f", fps), cv::Point(0, 15), cv::FONT_HERSHEY_SIMPLEX, 0.5, text_color, 2);
    }

    for (int i = 0; i < faces.rows; ++i)
    {
        // Draw the bounding box
        int x1 = static_cast<int>(faces.at<float>(i, 0));
        int y1 = static_cast<int>(faces.at<float>(i, 1));
        int w = static_cast<int>(faces.at<float>(i, 2));
        int h = static_cast<int>(faces.at<float>(i, 3));
        cv::rectangle(output_image, cv::Rect(x1, y1, w, h), box_color, 2);

        // Put the expression label just below the top edge of the box
        cv::putText(output_image, expressions[i], cv::Point(x1, y1 + 12), cv::FONT_HERSHEY_DUPLEX, 0.5, text_color);

        // Draw landmarks
        for (int j = 0; j < static_cast<int>(landmark_color.size()); ++j)
        {
            int x = static_cast<int>(faces.at<float>(i, 2 * j + 4)), y = static_cast<int>(faces.at<float>(i, 2 * j + 5));
            cv::circle(output_image, cv::Point(x, y), 2, landmark_color[j], 2);
        }
    }
    return output_image;
}
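// Command-line options, in OpenCV CommandLineParser syntax.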
string keys =
    "{ help h           |                                                           | Print help message. }"
    "{ model m          | facial_expression_recognition_mobilefacenet_2022july.onnx | Path to the facial expression recognition model. }"
    "{ yunet_model ym   | ../face_detection_yunet/face_detection_yunet_2023mar.onnx | Path to the YuNet face detection model. }"
    "{ input i          |                                                           | Path to the input image or video file. Omit this argument to capture frames from a camera. }"
    "{ backend_target t | 0                                                         | Choose one of the backend-target pairs to run this demo:\n"
    "0: (default) OpenCV implementation + CPU,\n"
    "1: CUDA + GPU (CUDA),\n"
    "2: CUDA + GPU (CUDA FP16),\n"
    "3: TIM-VX + NPU,\n"
    "4: CANN + NPU}"
    "{ save s           | false                                                     | Specify to save the results. }"
    "{ vis v            | true                                                      | Specify to open a window for result visualization. }"
    ;
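// Entry point: parse arguments, build the detector and the expression
// recognizer, then run the detect-classify-visualize loop frame by frame.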
int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);
    parser.about("Facial Expression Recognition");
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    string modelPath = parser.get<string>("model");
    string yunetModelPath = parser.get<string>("yunet_model");
    string inputPath = parser.get<string>("input");
    int backendTarget = parser.get<int>("backend_target");
    bool saveFlag = parser.get<bool>("save");
    bool visFlag = parser.get<bool>("vis");

    if (modelPath.empty())
        CV_Error(Error::StsError, "Path to the model file is empty");
    if (yunetModelPath.empty())
        CV_Error(Error::StsError, "Path to the face detection model file is empty");
    if (backendTarget < 0 || backendTarget >= static_cast<int>(backend_target_pairs.size()))
        CV_Error(Error::StsError, "Invalid backend-target pair index");

    YuNet faceDetectionModel(yunetModelPath);
    FER expressionRecognitionModel(modelPath, backend_target_pairs[backendTarget].first, backend_target_pairs[backendTarget].second);

    VideoCapture cap;
    if (!inputPath.empty())
        cap.open(samples::findFile(inputPath));
    else
        cap.open(0);
    if (!cap.isOpened())
        CV_Error(Error::StsError, "Cannot open the video or file");

    Mat frame;
    static const std::string kWinName = "Facial Expression Demo";
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            if (inputPath.empty())
                cout << "Frame is empty" << endl;
            break;
        }

        // Detect faces, then classify the expression of each one.
        faceDetectionModel.setInputSize(frame.size());
        Mat faces = faceDetectionModel.infer(frame);
        vector<String> expressions;
        for (int i = 0; i < faces.rows; ++i)
        {
            Mat face = faces.row(i);
            String exp = expressionRecognitionModel.infer(frame, face);
            expressions.push_back(exp);

            int x1 = static_cast<int>(faces.at<float>(i, 0));
            int y1 = static_cast<int>(faces.at<float>(i, 1));
            int w = static_cast<int>(faces.at<float>(i, 2));
            int h = static_cast<int>(faces.at<float>(i, 3));
            float conf = faces.at<float>(i, 14);
            std::cout << cv::format("%d: x1=%d, y1=%d, w=%d, h=%d, conf=%.4f expression=%s\n", i, x1, y1, w, h, conf, exp.c_str());
        }

        Mat res_frame = visualize(frame, faces, expressions);
        if (visFlag || inputPath.empty())
        {
            imshow(kWinName, res_frame);
            // For a single image input, wait for a key press before exiting.
            if (!inputPath.empty())
                waitKey(0);
        }
        if (saveFlag)
        {
            cout << "Results are saved to result.jpg" << endl;
            cv::imwrite("result.jpg", res_frame);
        }
    }
    return 0;
}