Spresenseによる人物検出

Spresenseを用いて人物検出を行います。Tensorflow lite/micro のexamples「person_detection」を参照してArduino IDE上で作成します。

Tensorflow をサポートした Arduino IDE構築

「Spresense Arduino Package for Tensorflow」を参考にして、Tensorflow をサポートした Arduino IDEを構築します。

メニュー「ファイル」 →「環境設定」を選択し、「追加のボードマネージャのURL 」に次のURLを追加します。

https://raw.githubusercontent.com/YoshinoTaro/spresense-arduino-tensorflow/main/package_spresense_tensorflow_index.json

メニュー「ツール」 →「ボード」 → 「ボードマネージャ」を選択し、「Spresense Tensorflow Board」を検索して表示されたパッケージを選択してインストールします。

メニュー「ツール」 → 「ボード」 → 「Spresense tensorflow Boards」 → 「Spresense tensorflow」の順に選択します。

人認識モデルの準備

「Tensorflow lite/micro/models 」には人認識モデル「person_detect.tflite」が用意されています。このモデルを次の変換スクリプト「convert_tflite_to_c_header.py」を用いて、ヘッダファイル「person_detect_model.h」に変換します。

convert_tflite_to_c_header.py

import os
import tensorflow as tf
import binascii

def convert_to_c_array(bytes) -> str:
    """Render a bytes-like object as the body of a C array initializer.

    Every byte becomes an upper-case ``0xNN`` literal. Literals within a row
    are separated by a comma and a space, ten literals per row; rows are
    joined by a comma, a line break and two spaces of indentation. Empty
    input produces an empty string.
    """
    digits_per_row = 20  # ten bytes per row, two hex digits per byte
    hex_digits = binascii.hexlify(bytes).decode("UTF-8").upper()

    rows = []
    for row_start in range(0, len(hex_digits), digits_per_row):
        row_digits = hex_digits[row_start:row_start + digits_per_row]
        literals = ("0x" + row_digits[k:k + 2]
                    for k in range(0, len(row_digits), 2))
        rows.append(", ".join(literals))
    return ",\n  ".join(rows)

# Read the whole model; the context manager closes the file handle as soon as
# the bytes are in memory instead of leaving that to garbage collection.
with open('person_detect.tflite', 'rb') as model_file:
    tflite_binary = model_file.read()

# Format the raw model bytes as comma-separated hex literals.
ascii_bytes = convert_to_c_array(tflite_binary)

# Emit the byte array followed by its length. The text ends with a line break
# so the generated header is a well-formed text file.
header_file = ("const unsigned char model_tflite[] = {\n  " + ascii_bytes
               + "\n};\nunsigned int model_tflite_len = "
               + str(len(tflite_binary)) + ";\n")

with open("person_detect_model.h", "w") as f:
    f.write(header_file)

人物検出スケッチの作成

人物検出スケッチは、Spresenseカメラからの画像を人物検出してLCDに結果を表示します。「SPRESENSEカメラで Tensorflow lite/micro の ”person detection model” をリアルタイムで動かしてみた」を参考にします。

45行目でカメラからの画像を取得し、55行目でYUV422形式からグレースケールに変換して入力に設定します。画像サイズは96px X 96pxにしています。
60行目で人物検出を行い、72行目で人物の得点が人物なしの得点を上回り、かつ「60」より大きい場合に人物を検出したとし、LED3を点灯します。
80行目で画像の表示を要求します。
148行目でカメラの画像を160px X 120pxのYUV422形式で取得します。

spresense_tf_person_detect_by_camera.ino

#include <Camera.h>

#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/micro/system_setup.h"
#include "tensorflow/lite/schema/schema_generated.h"

#include "person_detect_model.h"

/* Display helpers; this sketch only declares them and calls them from
   setup() and CamCB(). */
extern void setup_display() ;
extern void disp_image(uint16_t* buf, int offset_x, int offset_y
                       , int target_w, int target_h, int width,bool result);

/* TensorFlow Lite Micro state; every pointer is filled in by setup(). */
tflite::ErrorReporter* error_reporter = nullptr;
const tflite::Model* model = nullptr;
tflite::MicroInterpreter* interpreter = nullptr;
TfLiteTensor* input = nullptr;   /* interpreter->input(0) */
TfLiteTensor* output = nullptr;  /* interpreter->output(0) */
int inference_count = 0;         /* not referenced by the code shown in this sketch */

/* Working memory handed to the MicroInterpreter in setup(). */
constexpr int kTensorArenaSize = 100000;
uint8_t tensor_arena[kTensorArenaSize];

/* Camera frame geometry and the crop window taken from it.
   The camera is started with width x height frames in format pixfmt;
   CamCB() reads a target_w x target_h window whose top-left corner is
   (offset_x, offset_y), i.e. the centre of the 160 x 120 frame. */
const int offset_x = 32;
const int offset_y = 12;
const int width    = 160;
const int height   = 120;
const int target_w = 96;
const int target_h = 96;
const int pixfmt   = CAM_IMAGE_PIX_FMT_YUV422;

/* callback function of the camera streaming */
/* the inference process is done in this function */
void CamCB(CamImage img) {
  static uint32_t last_mills = 0;

  if (!img.isAvailable()) {
    Serial.println("img is not available");
    return;
  }

  /* get image data from the frame memory */
  uint16_t* buf = (uint16_t*)img.getImgBuff();
  int n = 0;
  for (int y = offset_y; y < offset_y + target_h; ++y) {
    for (int x = offset_x; x < offset_x + target_w; ++x) {
      /* extracting luminance data from YUV422 data */
      uint16_t value = buf[y * width + x];
      uint16_t y_h = (value & 0xf000) >> 8;
      uint16_t y_l = (value & 0x00f0) >> 4;
      value = (y_h | y_l);  /* luminance data */
      /* set the grayscale data to the input buffer for TensorFlow  */
      input->data.f[n++] = (float)(value) / 255.0;
    }
  }

  Serial.println("Do inference");
  TfLiteStatus invoke_status = interpreter->Invoke();
  if (invoke_status != kTfLiteOk) {
    Serial.println("Invoke failed");
    return;
  }

  /* get the result */
  bool result = false;
  int8_t person_score = output->data.uint8[1];
  int8_t no_person_score = output->data.uint8[0];
  Serial.print("Person = " + String(person_score) + ", ");
  Serial.println("No_person = " + String(no_person_score));
  if ((person_score > no_person_score) && (person_score > 60)) {
    digitalWrite(LED3, HIGH);
    result = true;
  } else {
    digitalWrite(LED3, LOW);
  }

  /* display the captured data */
  disp_image(buf, offset_x, offset_y, target_w, target_h,width, result);

  uint32_t current_mills = millis();
  uint32_t duration = current_mills - last_mills;
  Serial.println("duration = " + String(duration));
  last_mills = current_mills;
}


void setup() {
  Serial.begin(115200);
  setup_display();

  tflite::InitializeTarget();
  memset(tensor_arena, 0, kTensorArenaSize * sizeof(uint8_t));

  // Set up logging.
  static tflite::MicroErrorReporter micro_error_reporter;
  error_reporter = &micro_error_reporter;

  // Map the model into a usable data structure..
  model = tflite::GetModel(model_tflite);
  if (model->version() != TFLITE_SCHEMA_VERSION) {
    Serial.println("Model provided is schema version "
                   + String(model->version()) + " not equal "
                   + "to supported version "
                   + String(TFLITE_SCHEMA_VERSION));
    return;
  } else {
    Serial.println("Model version: " + String(model->version()));
  }
  // This pulls in all the operation implementations we need.
  static tflite::AllOpsResolver resolver;

  // Build an interpreter to run the model with.
  static tflite::MicroInterpreter static_interpreter(
    model, resolver, tensor_arena, kTensorArenaSize, error_reporter);
  interpreter = &static_interpreter;

  // Allocate memory from the tensor_arena for the model's tensors.
  TfLiteStatus allocate_status = interpreter->AllocateTensors();
  if (allocate_status != kTfLiteOk) {
    Serial.println("AllocateTensors() failed");
    return;
  } else {
    Serial.println("AllocateTensor() Success");
  }

  size_t used_size = interpreter->arena_used_bytes();
  Serial.println("Area used bytes: " + String(used_size));
  input = interpreter->input(0);
  output = interpreter->output(0);

  Serial.println("Model input:");
  Serial.println("dims->size: " + String(input->dims->size));
  for (int n = 0; n < input->dims->size; ++n) {
    Serial.println("dims->data[" + String(n) + "]: " + String(input->dims->data[n]));
  }

  Serial.println("Model output:");
  Serial.println("dims->size: " + String(output->dims->size));
  for (int n = 0; n < output->dims->size; ++n) {
    Serial.println("dims->data[" + String(n) + "]: " + String(output->dims->data[n]));
  }

  Serial.println("Completed tensorflow setup");
  digitalWrite(LED0, HIGH);

  CamErr err = theCamera.begin(1, CAM_VIDEO_FPS_15, width, height, pixfmt);
  if (err != CAM_ERR_SUCCESS) {
    Serial.println("camera begin err: " + String(err));
    return;
  }
  err = theCamera.startStreaming(true, CamCB);
  if (err != CAM_ERR_SUCCESS) {
    Serial.println("start streaming err: " + String(err));
    return;
  }
}

/* Intentionally empty: setup() registers CamCB as the camera streaming
   callback, and all per-frame work happens there. */
void loop() {
}

59行目のinvert関数で画像データを反転させます。
54行目で人物検出すると赤のボックスを設定し、61行目でグレースケールに変換した96px X 96pxの画像をLCDの中央に表示します。

display.cpp

#include "Adafruit_GFX.h"
#include "Adafruit_ST7789.h"

/* Pin numbers passed to the Adafruit_ST7789 constructor below. */
#define TFT_CS        10
#define TFT_RST        9 // Or set to -1 and connect to Arduino RESET pin
#define TFT_DC         8

/* Frame and crop geometry. In this file only target_w and target_h are used
   at file scope (to size disp); disp_image() receives its own values through
   parameters of the same names, which shadow these constants. */
const int offset_x = 32;
const int offset_y = 12;
const int width    = 160;
const int height   = 120;
const int target_w = 96;
const int target_h = 96;

/* indicator box: inclusive pixel range, relative to the crop origin, that
   disp_image() fills with a solid colour when its `result` flag is set */
int box_sx = 80;
int box_ex = 90;
int box_sy = 5;
int box_ey = 15;


/* Display driver instance and the RGB565 staging buffer for one cropped image. */
Adafruit_ST7789 tft = Adafruit_ST7789(TFT_CS , TFT_DC , TFT_RST);
uint16_t disp[target_w * target_h];

/* Initialise the ST7789 panel as 240 x 320, apply rotation setting 3 and
   clear the whole screen to black. */
void setup_display() {
  tft.init(240, 320);
  tft.setRotation(3);  /* presumably a landscape orientation - confirm on hardware */
  tft.fillScreen(ST77XX_BLACK);
}
/* Replace each of the first `count` 16-bit words in `databuf` with its
   bitwise complement (identical to 0xffff minus the word). A count of zero or
   less leaves the buffer untouched. */
void invert(uint16_t *databuf, int count)
{
  uint16_t *cursor = databuf;
  for (int remaining = count; remaining > 0; --remaining) {
    *cursor = static_cast<uint16_t>(~*cursor);
    ++cursor;
  }
}

/* Convert a crop of the camera frame to grayscale RGB565 and draw it.
 *
 * buf       16-bit frame data, `width` words per row
 * offset_x  left edge of the crop inside the frame
 * offset_y  top edge of the crop inside the frame
 * target_w  crop width in pixels
 * target_h  crop height in pixels
 * width     frame width in pixels
 * result    when true, the indicator box (box_sx..box_ex, box_sy..box_ey,
 *           relative to the crop origin, inclusive) is filled with 0xF800
 *
 * The staged image is passed through invert() before it is drawn at (112, 72).
 * Nothing is drawn when buf is null, when a crop dimension is not positive,
 * or when the crop would not fit into the disp staging buffer.
 */
void disp_image(uint16_t* buf, int offset_x, int offset_y
                , int target_w, int target_h, int width, bool result) {
  /* disp is sized from the file-scope constants, whereas the loops below use
     the caller-supplied parameters of the same names - verify that the crop
     fits before writing into it */
  const int capacity = (int)(sizeof(disp) / sizeof(disp[0]));
  if (buf == nullptr || target_w <= 0 || target_h <= 0
      || target_w > capacity / target_h) {
    return;
  }

  int n = 0;
  for (int y = offset_y; y < offset_y + target_h; ++y) {
    for (int x = offset_x; x < offset_x + target_w; ++x) {
      /* NOTE(review): this joins the top nibble of the high byte with the top
         nibble of the low byte. If the luminance is the whole high byte,
         (value >> 8) would be the full sample - confirm against the camera's
         YUV422 byte order. */
      uint16_t value = buf[y * width + x];
      uint16_t y_h = (value & 0xf000) >> 8;
      uint16_t y_l = (value & 0x00f0) >> 4;
      value = (y_h | y_l);
      /* gray RGB565: the same 8-bit level reduced to 5/6/5 bits per channel */
      uint16_t value6 = (value >> 2);
      uint16_t value5 = (value >> 3);
      disp[n] = (value5 << 11) | (value6 << 5) | value5;
      if (result && (y >= (offset_y + box_sy)) && (y <= (offset_y + box_ey))
          && (x >= (offset_x + box_sx)) && (x <= (offset_x + box_ex))) {
        disp[n] = 0xF800;  /* indicator box colour, before inversion */
      }
      ++n;
    }
  }
  invert(disp, target_w * target_h);
  tft.drawRGBBitmap(112, 72, disp, target_w, target_h);
}

人物検出スケッチの実行

人物検出スケッチを実行する前に、デフォルトだとメモリが足りないので、次のように「Main Core」のメモリ設定を「1152kB」に変更します。

実行して次のように人物を検出すると赤いボックスが表示されます。

シリアルモニタには次のようにメッセージが表示されます。

Model version: 3
AllocateTensor() Success
Area used bytes: 82300
Model input:
dims->size: 4
dims->data[0]: 1
dims->data[1]: 96
dims->data[2]: 96
dims->data[3]: 1
Model output:
dims->size: 2
dims->data[0]: 1
dims->data[1]: 2
Completed tensorflow setup
Do inference
Person = -25, No_person = 25
duration = 5919
Do inference
Person = -43, No_person = 43
duration = 593
Do inference
Person = -49, No_person = 49
duration = 592
Do inference
Person = -47, No_person = 47
duration = 593
Do inference
Person = -59, No_person = 59
duration = 593
Do inference