// postprocess.cpp
// Copyright (c) 2021 by Rockchip Electronics Co., Ltd. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include "postprocess.h"

#include <algorithm>
#include <numeric>
#include <set>
#include <vector>
  /// Limit `val` to the range [min, max] and return it truncated to int.
  ///
  /// The bounds are compared as floats, so the returned bound is the
  /// float-converted value of `min` / `max` converted back to int.
  inline static int clamp(float val, int min, int max)
  {
      const float lower = min;
      const float upper = max;

      float bounded = lower;
      if (val > lower)
      {
          bounded = (val < upper) ? val : upper;
      }
      return static_cast<int>(bounded);
  }
  /// Read one line (up to, but not including, '\n' or EOF) from `fp`.
  ///
  /// @param fp      Stream to read from.
  /// @param buffer  Ignored on input; kept only so existing callers compile.
  /// @param len     Receives the number of characters in the returned line.
  /// @return A malloc'ed, NUL-terminated string that the caller must free(),
  ///         or NULL on allocation failure, on a read error, or when EOF is
  ///         reached before any character was read.
  static char *readLine(FILE *fp, char *buffer, int *len)
  {
      (void)buffer; // the incoming pointer value is never used

      size_t capacity = 64;
      size_t length = 0;
      char *line = (char *)malloc(capacity);
      if (!line)
      {
          return NULL; // Out of memory
      }

      int ch;
      while ((ch = fgetc(fp)) != '\n' && ch != EOF)
      {
          // Grow geometrically (always leaving room for the terminator)
          // instead of calling realloc once per character.
          if (length + 1 >= capacity)
          {
              const size_t grown = capacity * 2;
              char *tmp = (char *)realloc(line, grown);
              if (tmp == NULL)
              {
                  free(line);
                  return NULL; // Out of memory
              }
              line = tmp;
              capacity = grown;
          }
          line[length++] = (char)ch;
      }
      line[length] = '\0';
      *len = (int)length;

      // Nothing left to read, or the stream reported an error.
      if (ch == EOF && (length == 0 || ferror(fp)))
      {
          free(line);
          return NULL;
      }
      return line;
  }
  55. static int readLines(const char *fileName, char *lines[], int max_line)
  56. {
  57. FILE *file = fopen(fileName, "r");
  58. char *s;
  59. int i = 0;
  60. int n = 0;
  61. if (file == NULL)
  62. {
  63. printf("Open %s fail!\n", fileName);
  64. return -1;
  65. }
  66. while ((s = readLine(file, s, &n)) != NULL)
  67. {
  68. lines[i++] = s;
  69. if (i >= max_line)
  70. break;
  71. }
  72. fclose(file);
  73. return i;
  74. }
  /// Intersection-over-union of two boxes given by their min/max corners.
  ///
  /// Every extent is measured as `max - min + 1`. Returns 0 when the union
  /// area is not positive.
  static float CalculateOverlap(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1,
                                float ymax1)
  {
      float w = fmax(0.f, fmin(xmax0, xmax1) - fmax(xmin0, xmin1) + 1.0);
      float h = fmax(0.f, fmin(ymax0, ymax1) - fmax(ymin0, ymin1) + 1.0);
      float i = w * h;
      float u = (xmax0 - xmin0 + 1.0) * (ymax0 - ymin0 + 1.0) + (xmax1 - xmin1 + 1.0) * (ymax1 - ymin1 + 1.0) - i;
      return u <= 0.f ? 0.f : (i / u);
  }

  /// Non-maximum suppression restricted to a single class.
  ///
  /// @param validCount       Number of entries in `order` to consider.
  /// @param outputLocations  Boxes stored as consecutive (x, y, w, h) groups,
  ///                         addressed by detection index.
  /// @param classIds         Class id of each detection, addressed by
  ///                         detection index.
  /// @param order            Detection indices in ranking order; suppressed
  ///                         entries are overwritten with -1.
  /// @param filterId         Only detections of this class take part.
  /// @param threshold        A lower-ranked box is suppressed when its IoU
  ///                         with a kept box exceeds this value.
  /// @return Always 0.
  static int nms(int validCount, std::vector<float> &outputLocations, const std::vector<int> &classIds, std::vector<int> &order,
                 int filterId, float threshold)
  {
      for (int i = 0; i < validCount; ++i)
      {
          const int n = order[i];
          // The class must be looked up through the detection index stored in
          // `order`, not through the rank position i.
          if (n == -1 || classIds[n] != filterId)
          {
              continue;
          }

          // Corners of the kept box do not change inside the inner loop.
          const float xmin0 = outputLocations[n * 4 + 0];
          const float ymin0 = outputLocations[n * 4 + 1];
          const float xmax0 = outputLocations[n * 4 + 0] + outputLocations[n * 4 + 2];
          const float ymax0 = outputLocations[n * 4 + 1] + outputLocations[n * 4 + 3];

          for (int j = i + 1; j < validCount; ++j)
          {
              const int m = order[j];
              // Only a candidate of the same class may be suppressed.
              if (m == -1 || classIds[m] != filterId)
              {
                  continue;
              }

              const float xmin1 = outputLocations[m * 4 + 0];
              const float ymin1 = outputLocations[m * 4 + 1];
              const float xmax1 = outputLocations[m * 4 + 0] + outputLocations[m * 4 + 2];
              const float ymax1 = outputLocations[m * 4 + 1] + outputLocations[m * 4 + 3];

              const float iou = CalculateOverlap(xmin0, ymin0, xmax0, ymax0, xmin1, ymin1, xmax1, ymax1);
              if (iou > threshold)
              {
                  order[j] = -1;
              }
          }
      }
      return 0;
  }
  /// Recursive quicksort that orders input[left..right] from largest to
  /// smallest and applies the same permutation to `indices`.
  ///
  /// @return The final position of the first pivot, or `left` when the range
  ///         holds fewer than two elements.
  static int quick_sort_indice_inverse(std::vector<float> &input, int left, int right, std::vector<int> &indices)
  {
      if (left >= right)
      {
          return left;
      }

      // The leftmost element is the pivot; its slot becomes the first "hole".
      const float pivot = input[left];
      const int pivot_index = indices[left];
      int lo = left;
      int hi = right;

      while (lo < hi)
      {
          // Walk down from the right to an element greater than the pivot.
          for (; lo < hi && input[hi] <= pivot; --hi)
          {
          }
          input[lo] = input[hi];
          indices[lo] = indices[hi];

          // Walk up from the left to an element smaller than the pivot.
          for (; lo < hi && input[lo] >= pivot; ++lo)
          {
          }
          input[hi] = input[lo];
          indices[hi] = indices[lo];
      }

      input[lo] = pivot;
      indices[lo] = pivot_index;

      quick_sort_indice_inverse(input, left, lo - 1, indices);
      quick_sort_indice_inverse(input, lo + 1, right, indices);
      return lo;
  }
  /// Logistic function 1 / (1 + e^-x).
  static float sigmoid(float x)
  {
      const double denominator = 1.0 + expf(-x);
      return static_cast<float>(1.0 / denominator);
  }
  /// Inverse of the logistic function: -log(1 / y - 1).
  static float unsigmoid(float y)
  {
      const float inverse_odds = static_cast<float>((1.0 / y) - 1.0);
      return static_cast<float>(-1.0 * logf(inverse_odds));
  }
  /// Limit `val` to [min, max] and convert the result to int32_t
  /// (the fractional part is discarded by the conversion).
  inline static int32_t __clip(float val, float min, float max)
  {
      float bounded = val;
      if (val <= min)
      {
          bounded = min;
      }
      else if (val >= max)
      {
          bounded = max;
      }
      return static_cast<int32_t>(bounded);
  }
  157. static int8_t qnt_f32_to_affine(float f32, int32_t zp, float scale)
  158. {
  159. float dst_val = (f32 / scale) + zp;
  160. int8_t res = (int8_t)__clip(dst_val, -128, 127);
  161. return res;
  162. }
  163. static uint8_t qnt_f32_to_affine_u8(float f32, int32_t zp, float scale)
  164. {
  165. float dst_val = (f32 / scale) + zp;
  166. uint8_t res = (uint8_t)__clip(dst_val, 0, 255);
  167. return res;
  168. }
  /// Dequantize an int8 value: (qnt - zp) * scale.
  static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale)
  {
      const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
      return centered * scale;
  }
  /// Dequantize a uint8 value: (qnt - zp) * scale.
  static float deqnt_affine_u8_to_f32(uint8_t qnt, int32_t zp, float scale)
  {
      const float centered = static_cast<float>(qnt) - static_cast<float>(zp);
      return centered * scale;
  }
  /// Turn four groups of `dfl_len` logits into four scalar values.
  ///
  /// For each group b the logits tensor[b * dfl_len .. b * dfl_len + dfl_len - 1]
  /// are passed through a softmax and box[b] is set to the softmax-weighted
  /// mean of the bin indices 0 .. dfl_len - 1.
  ///
  /// @param tensor   4 * dfl_len logits, one group after another.
  /// @param dfl_len  Number of bins per group; box[b] is 0 when it is <= 0.
  /// @param box      Output array of 4 floats.
  static void compute_dfl(float *tensor, int dfl_len, float *box)
  {
      for (int b = 0; b < 4; b++)
      {
          if (dfl_len <= 0)
          {
              box[b] = 0;
              continue;
          }

          const float *logits = tensor + b * dfl_len;
          // Subtracting the largest logit leaves the softmax unchanged but
          // keeps expf() from overflowing to infinity on large inputs.
          const float max_logit = *std::max_element(logits, logits + dfl_len);

          float exp_sum = 0;
          float weighted_sum = 0;
          for (int i = 0; i < dfl_len; i++)
          {
              const float e = expf(logits[i] - max_logit);
              exp_sum += e;
              weighted_sum += e * i;
          }
          box[b] = weighted_sum / exp_sum;
      }
  }
  190. static int process_u8(uint8_t *box_tensor, int32_t box_zp, float box_scale,
  191. uint8_t *score_tensor, int32_t score_zp, float score_scale,
  192. uint8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale,
  193. int grid_h, int grid_w, int stride, int dfl_len,
  194. std::vector<float> &boxes,
  195. std::vector<float> &objProbs,
  196. std::vector<int> &classId,
  197. float threshold, int OBJ_CLASS_NUM)
  198. {
  199. int validCount = 0;
  200. int grid_len = grid_h * grid_w;
  201. uint8_t score_thres_u8 = qnt_f32_to_affine_u8(threshold, score_zp, score_scale);
  202. uint8_t score_sum_thres_u8 = qnt_f32_to_affine_u8(threshold, score_sum_zp, score_sum_scale);
  203. for (int i = 0; i < grid_h; i++)
  204. {
  205. for (int j = 0; j < grid_w; j++)
  206. {
  207. int offset = i * grid_w + j;
  208. int max_class_id = -1;
  209. // Use score sum to quickly filter
  210. if (score_sum_tensor != nullptr)
  211. {
  212. if (score_sum_tensor[offset] < score_sum_thres_u8)
  213. {
  214. continue;
  215. }
  216. }
  217. uint8_t max_score = -score_zp;
  218. for (int c = 0; c < OBJ_CLASS_NUM; c++)
  219. {
  220. if ((score_tensor[offset] > score_thres_u8) && (score_tensor[offset] > max_score))
  221. {
  222. max_score = score_tensor[offset];
  223. max_class_id = c;
  224. }
  225. offset += grid_len;
  226. }
  227. // compute box
  228. if (max_score > score_thres_u8)
  229. {
  230. offset = i * grid_w + j;
  231. float box[4];
  232. float before_dfl[dfl_len * 4];
  233. for (int k = 0; k < dfl_len * 4; k++)
  234. {
  235. before_dfl[k] = deqnt_affine_u8_to_f32(box_tensor[offset], box_zp, box_scale);
  236. offset += grid_len;
  237. }
  238. compute_dfl(before_dfl, dfl_len, box);
  239. float x1, y1, x2, y2, w, h;
  240. x1 = (-box[0] + j + 0.5) * stride;
  241. y1 = (-box[1] + i + 0.5) * stride;
  242. x2 = (box[2] + j + 0.5) * stride;
  243. y2 = (box[3] + i + 0.5) * stride;
  244. w = x2 - x1;
  245. h = y2 - y1;
  246. boxes.push_back(x1);
  247. boxes.push_back(y1);
  248. boxes.push_back(w);
  249. boxes.push_back(h);
  250. objProbs.push_back(deqnt_affine_u8_to_f32(max_score, score_zp, score_scale));
  251. classId.push_back(max_class_id);
  252. validCount++;
  253. }
  254. }
  255. }
  256. return validCount;
  257. }
  258. static int process_i8(int8_t *box_tensor, int32_t box_zp, float box_scale,
  259. int8_t *score_tensor, int32_t score_zp, float score_scale,
  260. int8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale,
  261. int grid_h, int grid_w, int stride, int dfl_len,
  262. std::vector<float> &boxes,
  263. std::vector<float> &objProbs,
  264. std::vector<int> &classId,
  265. float threshold, int OBJ_CLASS_NUM)
  266. {
  267. int validCount = 0;
  268. int grid_len = grid_h * grid_w;
  269. int8_t score_thres_i8 = qnt_f32_to_affine(threshold, score_zp, score_scale);
  270. int8_t score_sum_thres_i8 = qnt_f32_to_affine(threshold, score_sum_zp, score_sum_scale);
  271. for (int i = 0; i < grid_h; i++)
  272. {
  273. for (int j = 0; j < grid_w; j++)
  274. {
  275. int offset = i * grid_w + j;
  276. int max_class_id = -1;
  277. // 通过 score sum 起到快速过滤的作用
  278. if (score_sum_tensor != nullptr)
  279. {
  280. if (score_sum_tensor[offset] < score_sum_thres_i8)
  281. {
  282. continue;
  283. }
  284. }
  285. int8_t max_score = -score_zp;
  286. for (int c = 0; c < OBJ_CLASS_NUM; c++)
  287. {
  288. if ((score_tensor[offset] > score_thres_i8) && (score_tensor[offset] > max_score))
  289. {
  290. max_score = score_tensor[offset];
  291. max_class_id = c;
  292. }
  293. offset += grid_len;
  294. }
  295. // compute box
  296. if (max_score > score_thres_i8)
  297. {
  298. offset = i * grid_w + j;
  299. float box[4];
  300. float before_dfl[dfl_len * 4];
  301. for (int k = 0; k < dfl_len * 4; k++)
  302. {
  303. before_dfl[k] = deqnt_affine_to_f32(box_tensor[offset], box_zp, box_scale);
  304. offset += grid_len;
  305. }
  306. compute_dfl(before_dfl, dfl_len, box);
  307. float x1, y1, x2, y2, w, h;
  308. x1 = (-box[0] + j + 0.5) * stride;
  309. y1 = (-box[1] + i + 0.5) * stride;
  310. x2 = (box[2] + j + 0.5) * stride;
  311. y2 = (box[3] + i + 0.5) * stride;
  312. w = x2 - x1;
  313. h = y2 - y1;
  314. boxes.push_back(x1);
  315. boxes.push_back(y1);
  316. boxes.push_back(w);
  317. boxes.push_back(h);
  318. objProbs.push_back(deqnt_affine_to_f32(max_score, score_zp, score_scale));
  319. classId.push_back(max_class_id);
  320. validCount++;
  321. }
  322. }
  323. }
  324. return validCount;
  325. }
  326. static int process_fp32(float *box_tensor, float *score_tensor, float *score_sum_tensor,
  327. int grid_h, int grid_w, int stride, int dfl_len,
  328. std::vector<float> &boxes,
  329. std::vector<float> &objProbs,
  330. std::vector<int> &classId,
  331. float threshold, int OBJ_CLASS_NUM)
  332. {
  333. int validCount = 0;
  334. int grid_len = grid_h * grid_w;
  335. for (int i = 0; i < grid_h; i++)
  336. {
  337. for (int j = 0; j < grid_w; j++)
  338. {
  339. int offset = i * grid_w + j;
  340. int max_class_id = -1;
  341. // 通过 score sum 起到快速过滤的作用
  342. if (score_sum_tensor != nullptr)
  343. {
  344. if (score_sum_tensor[offset] < threshold)
  345. {
  346. continue;
  347. }
  348. }
  349. float max_score = 0;
  350. for (int c = 0; c < OBJ_CLASS_NUM; c++)
  351. {
  352. if ((score_tensor[offset] > threshold) && (score_tensor[offset] > max_score))
  353. {
  354. max_score = score_tensor[offset];
  355. max_class_id = c;
  356. }
  357. offset += grid_len;
  358. }
  359. // compute box
  360. if (max_score > threshold)
  361. {
  362. offset = i * grid_w + j;
  363. float box[4];
  364. float before_dfl[dfl_len * 4];
  365. for (int k = 0; k < dfl_len * 4; k++)
  366. {
  367. before_dfl[k] = box_tensor[offset];
  368. offset += grid_len;
  369. }
  370. compute_dfl(before_dfl, dfl_len, box);
  371. float x1, y1, x2, y2, w, h;
  372. x1 = (-box[0] + j + 0.5) * stride;
  373. y1 = (-box[1] + i + 0.5) * stride;
  374. x2 = (box[2] + j + 0.5) * stride;
  375. y2 = (box[3] + i + 0.5) * stride;
  376. w = x2 - x1;
  377. h = y2 - y1;
  378. boxes.push_back(x1);
  379. boxes.push_back(y1);
  380. boxes.push_back(w);
  381. boxes.push_back(h);
  382. objProbs.push_back(max_score);
  383. classId.push_back(max_class_id);
  384. validCount++;
  385. }
  386. }
  387. }
  388. return validCount;
  389. }
  390. int post_process(PPYOLOE *app_ctx, rknn_output *outputs, float conf_threshold, float nms_threshold, object_detect_result_list *od_results, int OBJ_CLASS_NUM)
  391. {
  392. std::vector<float> filterBoxes;
  393. std::vector<float> objProbs;
  394. std::vector<int> classId;
  395. int validCount = 0;
  396. int stride = 0;
  397. int grid_h = 0;
  398. int grid_w = 0;
  399. int model_in_w = app_ctx->width;
  400. int model_in_h = app_ctx->height;
  401. memset(od_results, 0, sizeof(object_detect_result_list));
  402. // default 3 branch
  403. #ifdef RKNPU1
  404. int dfl_len = app_ctx->output_attrs[0].dims[2] / 4;
  405. #else
  406. int dfl_len = app_ctx->output_attrs[0].dims[1] / 4;
  407. #endif
  408. int output_per_branch = app_ctx->io_num.n_output / 3;
  409. for (int i = 0; i < 3; i++)
  410. {
  411. void *score_sum = nullptr;
  412. int32_t score_sum_zp = 0;
  413. float score_sum_scale = 1.0;
  414. if (output_per_branch == 3)
  415. {
  416. score_sum = outputs[i * output_per_branch + 2].buf;
  417. score_sum_zp = app_ctx->output_attrs[i * output_per_branch + 2].zp;
  418. score_sum_scale = app_ctx->output_attrs[i * output_per_branch + 2].scale;
  419. }
  420. int box_idx = i * output_per_branch;
  421. int score_idx = i * output_per_branch + 1;
  422. #ifdef RKNPU1
  423. grid_h = app_ctx->output_attrs[box_idx].dims[1];
  424. grid_w = app_ctx->output_attrs[box_idx].dims[0];
  425. #else
  426. grid_h = app_ctx->output_attrs[box_idx].dims[2];
  427. grid_w = app_ctx->output_attrs[box_idx].dims[3];
  428. #endif
  429. stride = model_in_h / grid_h;
  430. if (app_ctx->is_quant)
  431. {
  432. #ifdef RKNPU1
  433. validCount += process_u8((uint8_t *)outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale,
  434. (uint8_t *)outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale,
  435. (uint8_t *)score_sum, score_sum_zp, score_sum_scale,
  436. grid_h, grid_w, stride, dfl_len,
  437. filterBoxes, objProbs, classId, conf_threshold);
  438. #else
  439. validCount += process_i8((int8_t *)outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale,
  440. (int8_t *)outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale,
  441. (int8_t *)score_sum, score_sum_zp, score_sum_scale,
  442. grid_h, grid_w, stride, dfl_len,
  443. filterBoxes, objProbs, classId, conf_threshold, OBJ_CLASS_NUM);
  444. #endif
  445. }
  446. else
  447. {
  448. validCount += process_fp32((float *)outputs[box_idx].buf, (float *)outputs[score_idx].buf, (float *)score_sum,
  449. grid_h, grid_w, stride, dfl_len,
  450. filterBoxes, objProbs, classId, conf_threshold, OBJ_CLASS_NUM);
  451. }
  452. }
  453. // no object detect
  454. if (validCount <= 0)
  455. {
  456. return 0;
  457. }
  458. std::vector<int> indexArray;
  459. for (int i = 0; i < validCount; ++i)
  460. {
  461. indexArray.push_back(i);
  462. }
  463. quick_sort_indice_inverse(objProbs, 0, validCount - 1, indexArray);
  464. std::set<int> class_set(std::begin(classId), std::end(classId));
  465. for (auto c : class_set)
  466. {
  467. nms(validCount, filterBoxes, classId, indexArray, c, nms_threshold);
  468. }
  469. int last_count = 0;
  470. od_results->count = 0;
  471. /* box valid detect target */
  472. for (int i = 0; i < validCount; ++i)
  473. {
  474. if (indexArray[i] == -1 || last_count >= 128)
  475. {
  476. continue;
  477. }
  478. int n = indexArray[i];
  479. float x1 = filterBoxes[n * 4 + 0];
  480. float y1 = filterBoxes[n * 4 + 1];
  481. float x2 = x1 + filterBoxes[n * 4 + 2];
  482. float y2 = y1 + filterBoxes[n * 4 + 3];
  483. int id = classId[n];
  484. float obj_conf = objProbs[i];
  485. od_results->results[last_count].box.left = (int)(clamp(x1, 0, model_in_w));
  486. od_results->results[last_count].box.top = (int)(clamp(y1, 0, model_in_h));
  487. od_results->results[last_count].box.right = (int)(clamp(x2, 0, model_in_w));
  488. od_results->results[last_count].box.bottom = (int)(clamp(y2, 0, model_in_h));
  489. od_results->results[last_count].prop = obj_conf;
  490. od_results->results[last_count].cls_id = id;
  491. last_count++;
  492. }
  493. od_results->count = last_count;
  494. return 0;
  495. }