Reputation: 6082
I'm new to Unity 3D and am developing an app that includes an "automatic" lip-syncing feature.
I'm following the tutorial below:
http://answers.unity3d.com/questions/139323/any-way-of-quotautomaticquot-lip-syncing.html
Here is my code:
using UnityEngine;
using System.Collections;
/// <summary>
/// Simple "automatic" lip sync: every frame, samples the spectrum of the
/// playing AudioSource, smooths the energy in the speech frequency band,
/// and moves the mouth transforms along their local y-axis in proportion.
/// Attach to a GameObject that also has an AudioSource component.
/// </summary>
public class lipmovement2 : MonoBehaviour
{
    /*Class for implementing Lips Syncronisation*/
    public AudioClip source_clip;   // optional: only used to compute video_Length
    public float[] freqData;        // spectrum buffer filled by GetSpectrumData
    int nSamples = 256;             // FFT size; must be a power of two
    int fMax = 24000;               // assumed Nyquist frequency of the output

    public Transform upmouth0_M, upmouth01_L, upmouth02_R, downmouth1_M, downmouth11_L, downmouth12_R;

    float volume = 1000;            // gain applied to band energy before clamping
    float freqLow = 200;            // lower edge of the speech band (Hz)
    float freqHigh = 1600;          // upper edge of the speech band (Hz)

    int sizeFilter = 5;             // moving-average window length
    float[] filter;
    float filterSum;
    int posFilter = 0;
    int qSample = 0;

    int video_Length, secCounter;

    // Rest position (local y) of each mouth transform, captured in OnEnable.
    // BUG FIX: the original overwrote a single y0/y1 three times, so only the
    // *_R baselines were actually kept; each transform needs its own baseline.
    float y0M, y0L, y0R;
    float y1M, y1L, y1R;

    // Cached so Update/BandVol do not call GetComponent every frame.
    AudioSource audioSource;

    // Smoothed, clamped mouth displacement computed in Update, applied in LateUpdate.
    float limValue;

    void OnEnable ()
    {
        secCounter = 0;

        // Capture each mouth part's rest position separately.
        y0M = upmouth0_M.localPosition.y;
        y0L = upmouth01_L.localPosition.y;
        y0R = upmouth02_R.localPosition.y;
        y1M = downmouth1_M.localPosition.y;
        y1L = downmouth11_L.localPosition.y;
        y1R = downmouth12_R.localPosition.y;

        freqData = new float[nSamples];

        audioSource = GetComponent<AudioSource> ();
        audioSource.clip = Rec_voice.instance.voiceFeed.clip;
        audioSource.Play ();

        // Guard: source_clip may be unassigned (its assignment was commented
        // out in the original), which would throw a NullReferenceException.
        if (source_clip != null) {
            video_Length = Mathf.CeilToInt (source_clip.length);
        }
    }

    /// <summary>
    /// Returns the summed spectrum energy between fLow and fHigh (Hz) of the
    /// audio currently playing on the cached AudioSource.
    /// </summary>
    float BandVol (float fLow, float fHigh)
    {
        fLow = Mathf.Clamp (fLow, 20, fMax);
        fHigh = Mathf.Clamp (fHigh, fLow, fMax);

        audioSource.GetSpectrumData (freqData, 0, FFTWindow.BlackmanHarris);

        // Map frequencies to FFT bin indices. Clamp n2 so that fHigh == fMax
        // cannot index one past the end of freqData (off-by-one in original).
        int n1 = Mathf.FloorToInt (fLow * nSamples / fMax);
        int n2 = Mathf.Min (Mathf.FloorToInt (fHigh * nSamples / fMax), freqData.Length - 1);

        float sum = 0;
        for (int i = n1; i <= n2; i++) {
            // BUG FIX: original used `sum = freqData[i]`, which kept only the
            // last bin; band volume must accumulate over every bin in range.
            sum += freqData [i];
        }
        return sum;
    }

    /// <summary>
    /// Running mean over the last sizeFilter samples. Warm-up aware: divides
    /// by the number of samples seen so far until the window fills.
    /// </summary>
    float MovingAverage (float sample)
    {
        if (qSample == 0) {
            filter = new float[sizeFilter];
        }

        // Replace the oldest sample in the circular buffer and keep a running sum.
        filterSum += sample - filter [posFilter];
        filter [posFilter++] = sample;

        if (posFilter > qSample) {
            qSample = posFilter;
        }
        posFilter = posFilter % sizeFilter;

        return filterSum / qSample;
    }

    void Start ()
    {
        // Initialization happens in OnEnable so it re-runs when the component
        // is re-enabled; nothing to do here.
    }

    void Update ()
    {
        // Smoothed band energy, scaled and clamped to a sane mouth travel.
        float band_vol = BandVol (freqLow, freqHigh);
        float val = MovingAverage (band_vol) * volume;
        limValue = Mathf.Clamp (val, 0, 25f);

        if (Input.GetKeyDown (KeyCode.Escape)) {
            Application.Quit ();
        }
    }

    void LateUpdate ()
    {
        // Upper lip moves down, lower lip moves up, from each part's own rest position.
        upmouth0_M.localPosition = new Vector3 (upmouth0_M.localPosition.x, y0M - limValue, upmouth0_M.localPosition.z);
        upmouth01_L.localPosition = new Vector3 (upmouth01_L.localPosition.x, y0L - limValue, upmouth01_L.localPosition.z);
        upmouth02_R.localPosition = new Vector3 (upmouth02_R.localPosition.x, y0R - limValue, upmouth02_R.localPosition.z);
        downmouth1_M.localPosition = new Vector3 (downmouth1_M.localPosition.x, y1M + limValue, downmouth1_M.localPosition.z);
        downmouth11_L.localPosition = new Vector3 (downmouth11_L.localPosition.x, y1L + limValue, downmouth11_L.localPosition.z);
        downmouth12_R.localPosition = new Vector3 (downmouth12_R.localPosition.x, y1R + limValue, downmouth12_R.localPosition.z);
    }
}
I'm facing the following issues:
1) How can I recognise a human voice? If some other sound, such as music, is playing, it is detected as well. How can I prevent that? I want the lips to sync only to a human voice.
2) When I record with the device close by, it works perfectly, but if the device is a little farther away, the lips no longer sync.
Can you suggest where I'm going wrong, and how to solve the above issues?
Upvotes: 3
Views: 2124
Reputation: 301
2) The sound level recorded by the mic decreases with distance. Thus, there is less energy in each frequency band (i.e. the values given by GetSpectrumData are smaller). If you increase the value of the 'volume' parameter, then val becomes bigger in
float val = MovingAverage (band_vol) * volume;
...and lips will move more along y-axis.
1) A simple algorithm would just look at frequency data and classify input as speech if there is enough noise in lower bands (say 0-1000Hz) compared to whole spectrum (say 0-16000Hz). This would perhaps prevent algorithm from doing lip sync in random noise. For more advanced needs, I would implement MFCC algorithm. Then I would train the algorithm with common phonemes and do the lip sync if MFCC calculated from recorded audio stream is close enough to training data.
Upvotes: 1