嘗試 MediaPipe + Unity 實作手勢辨識與互動 Part 1
這次想要體驗使用 Google 推出的跨平台 ML 框架 MediaPipe 來玩點小東西, 直接使用了已完美整合 MediaPipe 的 MediaPipeUnityPlugin 來進行手勢辨識初體驗,首先將 repos clone 下來後,要進行一連串的安裝與建置的步驟 🥵(見 Installation Guide),並安裝適當的 Unity 版本,這次使用的是:
- Unity 2021.1.7f1
- MacBookPro 13-inch 2019 (MacOS 10.15.4)
這邊使用的是 cpu 的建置版本:
build.py build --desktop cpu -v
建置成功後應該會看到類似以下訊息(感動):
之後便可以打開 MediaPipeUnityPlugin 專案執行展示場景:(Assets/Mediapipe/Samples/Scenes/DesktopDemo.scene)
如果可以順利看到 webcam 畫面與預設的 Face Detection 效果,那恭喜你到這邊準備工作算是順利完成了 😂
架構
接著我們要來應用 MediaPipeUnityPlugin 提供的腳本與架構來撰寫自己的場景,預計使用到的類別有:
// Root controller: obtains the webcam image through WebCamScreenController and hands it to GestureTrackingGraph.
class GestureTrackingDirector : MonoBehaviour { }
// HandTrackingGraph subclass that passes the analysed hand-skeleton data on for further processing.
class GestureTrackingGraph : HandTrackingGraph { }
// Per-finger open/closed state converted from the analysed hand-skeleton data.
class HandState { }
// Decides which gesture a given finger state represents.
class GestureAnalyzer { }
GestureTrackingDirector 是我們的 root controller,負責透過 WebCamScreenController 獲取 webcam 影像再傳遞給 GestureTrackingGraph 分析,之後我們將分析出的手部骨骼資訊轉換成 HandState,再餵給 GestureAnalyzer 判斷手勢。
場景物件配置如下:
其中 WebCamScreen 直接取用 DesktopDemo.scene 中的 WebCamScreen GameObject 就可以了,這邊只是為了要能夠快速建立起場景。
繼承自 HandTrackingGraph 的 GestureTrackingGraph,最主要只是要將分析後的手部骨骼資訊傳遞出來做進一步的分析,使用一樣的 HandTrackingAnnotationController 來做辨識影像的視覺化,其中會需要用到命名為 ResourceManager 的 GameObject,並在 ResourceManager 上加上 LocalAssetLoader component 即可。
GestureTrackingDirector 我們的 root controller 負責以上物件的溝通與傳遞。
GestureText 是用來顯示最後辨識出來的手勢結果。
架構準備好以後就可以來實作細節了 ✌🏻
實作細節
GestureTrackingGraph.cs
這裡主要只是要覆寫 RenderOutput,將獲取的分析結果透過 OnHandTrackingValueFetched 事件傳遞出來。
必須事先修改 HandTrackingGraph 中的 FetchNextHandTrackingValue 與 RenderAnnotation 方法,從 private 改成 protected,讓 GestureTrackingGraph 可以呼叫得到。
/// <summary>
/// Hand-tracking graph that, besides rendering the annotation and the screen,
/// publishes every fetched <see cref="HandTrackingValue"/> through
/// <see cref="OnHandTrackingValueFetched"/>.
/// </summary>
public class GestureTrackingGraph : HandTrackingGraph {
    /// <summary>Signature of the listeners that receive a fetched hand-tracking value.</summary>
    public delegate void GestureTrackingValueEvent (HandTrackingValue handTrackingValue);

    /// <summary>
    /// Raised as the last step of <see cref="RenderOutput"/>. It starts with a no-op
    /// subscriber so it can be invoked without a null check.
    /// </summary>
    public event GestureTrackingValueEvent OnHandTrackingValueFetched = delegate { };

    /// <summary>
    /// Fetches the next hand-tracking value, renders its annotation, draws the
    /// texture frame on the screen and finally notifies the subscribers.
    /// </summary>
    /// <param name="screenController">Screen controller used for the annotation and for drawing.</param>
    /// <param name="textureFrame">Frame drawn on the screen.</param>
    public override void RenderOutput (WebCamScreenController screenController, TextureFrame textureFrame) {
        var fetched = FetchNextHandTrackingValue ();

        RenderAnnotation (screenController, fetched);
        screenController.DrawScreen (textureFrame);

        OnHandTrackingValueFetched (fetched);
    }
}
接著我們想要透過 HandTrackingValue 回傳資料中的 NormalizedLandmarkList (手部骨骼的資訊)來判斷每隻手指的開合狀態,所以新增一個類別 HandState 來處理,並透過 enum FingerState 來表達每隻手指的個別狀態:
/// <summary>
/// Bit flags describing which fingers are open. Each finger owns one bit, so
/// several fingers can be combined with <c>|</c>; no bit set means all closed.
/// </summary>
[Flags]
public enum FingerState {
    /// <summary>No finger is open.</summary>
    Closed = 0,
    /// <summary>Thumb is open (bit 0).</summary>
    ThumbOpen = 1 << 0,
    /// <summary>Index finger is open (bit 1).</summary>
    IndexOpen = 1 << 1,
    /// <summary>Middle finger is open (bit 2).</summary>
    MiddleOpen = 1 << 2,
    /// <summary>Ring finger is open (bit 3).</summary>
    RingOpen = 1 << 3,
    /// <summary>Pinky is open (bit 4).</summary>
    PinkyOpen = 1 << 4,
}
NormalizedLandmarkList 的相關資訊可以參考官方文件或下圖:
// Keeps the most recently computed finger state and notifies listeners whenever it changes.
public class HandState { [Flags]
// FingerState members are elided here ({…}). HandStateEvent listeners receive the previous and the new state.
public enum FingerState {…} public delegate void HandStateEvent (FingerState previousState, FingerState currentState);
// OnStateChanged starts with a no-op handler, so it is invoked below without a null check.
// m_FingerState holds the state stored by the last call to Process.
public event HandStateEvent OnStateChanged = (p, c) => { }; FingerState m_FingerState; public void Process (Mediapipe.NormalizedLandmarkList landmarkList) {
// Start from Closed; the elided "Analyse Fingers" step is where flags are set from landmarkList.
// Listeners are notified only when the result differs from the stored state.
FingerState fingerState = FingerState.Closed; /* Analyse Fingers */ if (m_FingerState != fingerState) {
// The event is raised before the field is overwritten, so m_FingerState still holds the previous state while handlers run.
OnStateChanged (m_FingerState, fingerState);
m_FingerState = fingerState;
}
}
}
至於手指開合如何判斷呢?我們這邊單純用很簡易的幾何位置來分析,並只針對右手做判斷(拇指需判斷是手心還是手背面向鏡頭):
// Simple geometric test on the landmark positions. Per the accompanying article this
// targets the right hand and assumes the hand is held upright.
var points = landmarkList.Landmark;

// Thumb: landmarks 3 and 4 must both lie on the same side of landmark 2 along X.
// Which side counts as "open" depends on whether landmark 0 is right or left of
// landmark 1 (palm vs. back of the hand facing the camera).
float thumbPivotX = points [2].X;
if (points [0].X > points [1].X) {
    if (points [3].X < thumbPivotX && points [4].X < thumbPivotX) {
        fingerState |= FingerState.ThumbOpen;
    }
} else if (points [0].X < points [1].X) {
    if (points [3].X > thumbPivotX && points [4].X > thumbPivotX) {
        fingerState |= FingerState.ThumbOpen;
    }
}

// Other fingers: the two landmarks following the pivot must both have a smaller Y than the pivot.
bool IsRaised (int pivotIndex) {
    float pivotY = points [pivotIndex].Y;
    return points [pivotIndex + 1].Y < pivotY && points [pivotIndex + 2].Y < pivotY;
}

if (IsRaised (6)) {
    fingerState |= FingerState.IndexOpen;
}
if (IsRaised (10)) {
    fingerState |= FingerState.MiddleOpen;
}
if (IsRaised (14)) {
    fingerState |= FingerState.RingOpen;
}
if (IsRaised (18)) {
    fingerState |= FingerState.PinkyOpen;
}
可惜這裡是假設手是正擺的情形,如果是倒過來的話就會判斷錯誤(手心朝 y+,手指朝 y-),如果要解決旋轉的問題,可以透過 0, 5, 17 三個點位當參考點,先算出手掌的旋轉矩陣,再將所有其他點位轉換成正擺的位置即可,之後再做 😂
這樣基本上就可以簡單判斷出每隻手指的開合,我們可以運用開合來組合成不同的手勢,以下為 GestureAnalyzer:
/// <summary>
/// Extension methods that translate a finger state into a meaningful gesture.
/// </summary>
public static class GestureAnalyzer {
    /// <summary>
    /// Maps the given combination of open fingers to a <see cref="MeaningfulGesture"/>.
    /// </summary>
    /// <param name="state">Finger state to analyse.</param>
    /// <returns>The gesture that corresponds to <paramref name="state"/>.</returns>
    public static MeaningfulGesture Analyze (this HandState.FingerState state) {
        /* Analyze Gesture */
    }
}
定義有意義的手勢 MeaningfulGesture:
/// <summary>
/// Gestures that the analyzer can report. Members keep their implicit,
/// declaration-order numbering starting at zero.
/// </summary>
public enum MeaningfulGesture {
    /// <summary>No recognised gesture.</summary>
    None,

    /// <summary>Closed hand.</summary>
    Hold,

    // Counting gestures.
    One,
    Two,
    Three,
    Four,
    Five,
    Six,
    Seven,
    Eight,
    Nine,

    // Named gestures.
    Rock,
    Spiderman,
    FuckQ
}
接下來就很簡單了,比如說 Seven 就是 FingerState.ThumbOpen | FingerState.IndexOpen、Eight 就是 FingerState.ThumbOpen | FingerState.IndexOpen | FingerState.MiddleOpen:
// Short aliases for the individual finger flags, to keep the case labels readable.
const HandState.FingerState Thumb = HandState.FingerState.ThumbOpen;
const HandState.FingerState Index = HandState.FingerState.IndexOpen;
const HandState.FingerState Middle = HandState.FingerState.MiddleOpen;
const HandState.FingerState Ring = HandState.FingerState.RingOpen;
const HandState.FingerState Pinky = HandState.FingerState.PinkyOpen;

// Each gesture is an exact combination of open fingers; any other combination is None.
switch (state) {
    case HandState.FingerState.Closed:
        return MeaningfulGesture.Hold;
    case Index:
        return MeaningfulGesture.One;
    case Index | Middle:
        return MeaningfulGesture.Two;
    case Index | Middle | Ring:
        return MeaningfulGesture.Three;
    case Index | Middle | Ring | Pinky:
        return MeaningfulGesture.Four;
    case Thumb | Index | Middle | Ring | Pinky:
        return MeaningfulGesture.Five;
    case Thumb | Pinky:
        return MeaningfulGesture.Six;
    case Thumb | Index:
        return MeaningfulGesture.Seven;
    case Thumb | Index | Middle:
        return MeaningfulGesture.Eight;
    case Thumb | Index | Middle | Ring:
        return MeaningfulGesture.Nine;
    case Index | Pinky:
        return MeaningfulGesture.Rock;
    case Thumb | Index | Pinky:
        return MeaningfulGesture.Spiderman;
    case Middle:
        return MeaningfulGesture.FuckQ;
    default:
        return MeaningfulGesture.None;
}
這些都準備好以後,我們就可以透過 GestureTrackingDirector 把大家兜起來了!
- 這邊沒有特別做 WebCamTexture.devices 的選取功能,直接使用抓到的第一台 webcam。
- 只針對偵測到的第一隻手做手勢分析。
/// <summary>
/// Root controller of the scene: feeds webcam frames into the gesture-tracking
/// graph, converts the landmarks of the first detected hand into a
/// <see cref="HandState"/> and shows the resulting gesture name in a text field.
/// </summary>
public class GestureTrackingDirector : MonoBehaviour {
    [SerializeField] WebCamScreenController m_WebCamScreenCtr;
    [SerializeField] GestureTrackingGraph m_Graph;
    [SerializeField] Text m_GestureText;

    HandState m_Gesture = new HandState ();

    void Start () {
        m_Graph.OnHandTrackingValueFetched += HandleOnHandTrackingValueFetched;
        m_Gesture.OnStateChanged += HandleOnStateChanged;

        StartCoroutine (InitCamera ());
    }

    // Unsubscribe so the graph does not keep calling into a destroyed director.
    void OnDestroy () {
        m_Gesture.OnStateChanged -= HandleOnStateChanged;
        if (m_Graph != null) {
            m_Graph.OnHandTrackingValueFetched -= HandleOnHandTrackingValueFetched;
        }
    }

    #region Initialization
    /// <summary>
    /// Starts the first available webcam and, once it is playing, starts the frame loop.
    /// Logs an error and stops when no webcam is available.
    /// </summary>
    IEnumerator InitCamera () {
        // Read the device list once and guard against machines without a webcam,
        // where indexing [0] would throw.
        var devices = WebCamTexture.devices;
        if (devices.Length == 0) {
            Debug.LogError ("GestureTrackingDirector: no webcam device found.");
            yield break;
        }

        // No device selection is offered: the first webcam found is used.
        yield return StartCoroutine (m_WebCamScreenCtr.ResetScreen (devices [0]));

        if (m_WebCamScreenCtr.isPlaying) {
            StartCoroutine (RequestFrame ());
        }
    }

    /// <summary>
    /// Initializes and starts the graph, then once per frame requests the next
    /// webcam frame, pushes it into the graph and renders the output.
    /// </summary>
    IEnumerator RequestFrame () {
        m_Graph.Initialize ();
        m_Graph.StartRun (m_WebCamScreenCtr.GetScreen ());

        // Reuse one yield instruction instead of allocating a new one every frame.
        var endOfFrame = new WaitForEndOfFrame ();

        while (true) {
            yield return endOfFrame;

            var nextFrameRequest = m_WebCamScreenCtr.RequestNextFrame ();
            yield return nextFrameRequest;

            var nextFrame = nextFrameRequest.textureFrame;
            m_Graph.PushInput (nextFrame);
            m_Graph.RenderOutput (m_WebCamScreenCtr, nextFrame);
        }
    }
    #endregion

    #region Event Handlers
    /// <summary>Shows the name of the gesture that matches the new finger state.</summary>
    private void HandleOnStateChanged (HandState.FingerState previousState, HandState.FingerState currentState) {
        m_GestureText.text = currentState.Analyze ().ToString ();
    }

    /// <summary>Forwards the landmarks of the first detected hand, if any, to the hand state.</summary>
    private void HandleOnHandTrackingValueFetched (HandTrackingValue handTrackingValue) {
        // Only the first detected hand is analysed.
        if (handTrackingValue.HandLandmarkLists.Count > 0) {
            m_Gesture.Process (handTrackingValue.HandLandmarkLists [0]);
        }
    }
    #endregion
}
基本上這樣就完成了囉~