Publications

2009
Jon Gudnason; Mark R P Thomas; Patrick A Naylor; Dan P W Ellis
Voice Source Waveform Analysis and Synthesis using Principal Component Analysis and Gaussian Mixture Modelling
In Proc. Interspeech Conf., Brighton, UK, September 2009.
URL: https://www.ee.columbia.edu/~dpwe/pubs/GudTNE09-voicesource.pdf
Abstract: The paper presents a voice source waveform modeling technique based on principal component analysis (PCA) and Gaussian mixture modeling (GMM). The voice source is obtained by inverse-filtering speech with the estimated vocal tract filter. This decomposition is useful in speech analysis, synthesis, recognition and coding. Existing models of the voice source signal are based on function-fitting or physically motivated assumptions; although they are well defined, estimation of their parameters is not well understood and few are capable of reproducing the large variety of voice source waveforms. Here, a data-driven approach is presented for signal decomposition and classification based on the principal components of the voice source. The principal components are analyzed and the ‘prototype’ voice source signals corresponding to the Gaussian mixture means are examined. We show how an unknown signal can be decomposed into its components and/or prototypes and resynthesized, and how the techniques are suited to both low-bitrate and high-quality analysis/synthesis schemes.
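The pipeline the abstract outlines (inverse filtering, PCA over pitch-synchronous source cycles, a GMM over the PCA coefficients whose means act as prototypes) can be sketched compactly. This is a minimal illustration under stated assumptions, not the authors' implementation: cycle segmentation and length normalisation are assumed done elsewhere (e.g. from glottal closure instants), and the LPC order, PCA dimension and mixture count are placeholders.

```python
import numpy as np
from scipy.signal import lfilter
from scipy.linalg import solve_toeplitz
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

def lpc(frame, order=18):
    """Autocorrelation-method LPC: returns [1, -a_1, ..., -a_p]."""
    r = np.correlate(frame, frame, mode="full")[len(frame) - 1:]
    a = solve_toeplitz(r[:order], r[1:order + 1])
    return np.concatenate(([1.0], -a))

def voice_source(frame, order=18):
    """Inverse-filter a speech frame with its estimated vocal tract filter."""
    return lfilter(lpc(frame, order), [1.0], frame)

def fit_source_model(cycles, n_pcs=8, n_mix=16):
    """cycles: (n_cycles, cycle_len) length-normalised voice source cycles."""
    pca = PCA(n_components=n_pcs).fit(cycles)
    gmm = GaussianMixture(n_components=n_mix).fit(pca.transform(cycles))
    # 'Prototype' waveforms: mixture means mapped back to the time domain
    prototypes = pca.inverse_transform(gmm.means_)
    return pca, gmm, prototypes

def resynthesize_cycle(cycle, pca, gmm, prototypes):
    """Low-bitrate mode: replace a cycle by its most likely prototype;
    a high-quality mode would keep the per-cycle PCA coefficients instead."""
    k = gmm.predict(pca.transform(cycle[None, :]))[0]
    return prototypes[k]
```

The two return paths in `resynthesize_cycle`'s docstring mirror the abstract's low-bitrate/high-quality trade-off: transmitting one class index per cycle versus a handful of PCA coefficients.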
M R P Thomas; J Gudnason; P A Naylor
Detection of Glottal Closing and Opening Instants Using the Improved DYPSA Framework
In Proc. European Signal Processing Conf. (EUSIPCO), Glasgow, Scotland, August 2009.
URL: http://www.eurasip.org/Proceedings/Eusipco/Eusipco2009/contents/papers/1569192150.pdf
Abstract: Accurate estimation of glottal closure instants (GCIs) and opening instants (GOIs) is important for speech processing applications that benefit from glottal-synchronous processing. This paper proposes a novel improvement to the DYPSA framework, based upon a multiscale analysis technique and an accurate estimation of glottal volume velocity. This replaces the linear prediction residual for candidate selection and enables the reliable detection of both GCI and GOI candidates. A two-stage dynamic programming process then detects the GCIs and removes them from the candidate set, before detecting GOIs from the remaining candidates. A post-processing step improves GOI detection using the estimated GCIs. Evaluation against hand-labelled data on a large speech database shows that GCI detection is marginally improved compared with the original DYPSA, at 96%, but, more importantly, that GOI detection can be achieved to a similar accuracy of 95%.
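The abstract does not spell out the dynamic-programming cost function, so the following is only a toy illustration of the idea: from a set of candidate instants, pick the path whose spacing best matches a local period estimate. The cost terms and weights here are invented for illustration; the actual framework uses a richer cost and runs in two stages (GCIs first, then GOIs from the remaining candidates).

```python
import numpy as np

def select_gcis(cands, t0, lam=0.5):
    """Choose a subsequence of candidate instants (sample indices, sorted)
    whose spacing stays close to a period estimate t0 (samples).
    Toy stand-in for a DP candidate-selection stage."""
    n = len(cands)
    cost = np.full(n, np.inf)
    prev = np.full(n, -1)
    cost[0] = 0.0
    for j in range(1, n):
        for i in range(j):
            step = abs((cands[j] - cands[i]) - t0) / t0   # periodicity cost
            c = cost[i] + step + lam                      # lam penalises each jump
            if c < cost[j]:
                cost[j], prev[j] = c, i
    path, j = [], int(np.argmin(cost))                    # cheapest endpoint
    while j >= 0:
        path.append(cands[j])
        j = prev[j]
    return path[::-1]
```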
M R P Thomas; J Gudnason; P A Naylor
Data-Driven Voice Source Waveform Modelling
In Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP), Taipei, Taiwan, April 2009.
URL: http://ieeexplore.ieee.org/document/4960496/
Abstract: This paper presents a data-driven approach to the modelling of voice source waveforms. The voice source is a signal that is estimated by inverse-filtering speech signals with an estimate of the vocal tract filter. It is used in speech analysis, synthesis, recognition and coding to decompose a speech signal into its source and vocal tract filter components. Existing approaches parameterize the voice source signal with physically- or mathematically-motivated models. Though the models are well-defined, estimation of their parameters is not well understood and few are capable of reproducing the large variety of voice source waveforms. Here we present a data-driven approach to classify types of voice source waveforms based upon their mel-frequency cepstrum coefficients with Gaussian mixture modelling. A set of “prototype” waveform classes is derived from a weighted average of voice source cycles from real data. An unknown speech signal is then decomposed into its prototype components and resynthesized. Results indicate that with sixteen voice source classes, low resynthesis errors can be achieved.
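A compact sketch of the classification-and-prototype idea, assuming librosa for the MFCCs and treating each length-normalised source cycle as a single analysis frame. The sixteen-class setup follows the abstract; the sample rate, MFCC count and mel-band count are illustrative assumptions.

```python
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture

def cycle_mfcc(cycle, sr=16000, n_mfcc=13):
    """MFCCs of one length-normalised voice-source cycle, one frame."""
    m = librosa.feature.mfcc(y=np.asarray(cycle, float), sr=sr,
                             n_mfcc=n_mfcc, n_fft=len(cycle),
                             hop_length=len(cycle), n_mels=40)
    return m[:, 0]

def build_prototypes(cycles, n_classes=16, sr=16000):
    """Cluster cycles by MFCC with a GMM; each prototype waveform is a
    responsibility-weighted average of the raw cycles (cf. the paper's
    weighted average of voice source cycles)."""
    feats = np.stack([cycle_mfcc(c, sr) for c in cycles])
    gmm = GaussianMixture(n_components=n_classes, random_state=0).fit(feats)
    post = gmm.predict_proba(feats)               # (n_cycles, n_classes)
    protos = (post.T @ np.asarray(cycles)) / post.sum(axis=0)[:, None]
    return gmm, protos
```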
M R P Thomas; P A Naylor
The SIGMA Algorithm: A Glottal Activity Detector for Electroglottographic Signals
IEEE Trans. Audio, Speech, Lang. Process., vol. 17, no. 8, pp. 1557–1566, 2009.
URL: http://ieeexplore.ieee.org/document/4912310/
DOI: 10.1109/TASL.2009.2022430
Abstract: Accurate estimation of glottal closure instants (GCIs) and opening instants (GOIs) is important for speech processing applications that benefit from glottal-synchronous processing. The majority of existing approaches detect GCIs by comparing the differentiated EGG signal to a threshold and are able to provide accurate results during voiced speech. More recent algorithms use a similar approach across multiple dyadic scales using the stationary wavelet transform. All existing approaches are, however, prone to errors around the transition regions at the end of voiced segments of speech. This paper describes a new method for EGG-based glottal activity detection which exhibits high accuracy over the entirety of voiced segments, including, in particular, the transition regions, thereby giving significant improvement over existing methods. Following a stationary wavelet transform-based preprocessor, detection of excitation due to glottal closure is performed using a group delay function and then true and false detections are discriminated by Gaussian mixture modeling. GOI detection involves additional processing using the estimated GCIs. The main purpose of our algorithm is to provide a ground truth for GCIs and GOIs. This is essential in order to evaluate algorithms that estimate GCIs and GOIs from the speech signal only, and is also of high value in the analysis of pathological speech where knowledge of GCIs and GOIs is often needed. We compare our algorithm with two previous algorithms against a hand-labeled database. Evaluation has shown an average GCI hit rate of 99.47% and a GOI hit rate of 99.35%, compared to 96.08% and 92.54% for the best-performing existing algorithm.
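Two of the stages the abstract names, the stationary-wavelet-transform preprocessor and group-delay-based event detection, can be sketched as follows, assuming PyWavelets. The wavelet choice, level count and frame length are illustrative, and the GMM post-classifier is omitted here (a sketch of that stage follows the 2008 SIGMA entry below).

```python
import numpy as np
import pywt  # PyWavelets, assumed available

def multiscale_product(degg, wavelet="bior1.5", levels=3):
    """SWT preprocessor: the product of detail coefficients across dyadic
    scales reinforces edge-like events (glottal closures) in the
    differentiated EGG while suppressing incoherent detail."""
    n = len(degg) - len(degg) % 2 ** levels      # swt needs a divisible length
    coeffs = pywt.swt(np.asarray(degg[:n], float), wavelet, level=levels)
    details = np.stack([d for _, d in coeffs])
    return np.clip(np.prod(details, axis=0), 0.0, None)

def energy_weighted_instants(x, frame=200):
    """Group-delay-style localisation: within each frame, the centre of
    gravity of x^2 pinpoints an impulse-like event."""
    instants = []
    for s in range(0, len(x) - frame, frame // 2):
        w = x[s:s + frame] ** 2
        if w.sum() > 0:
            instants.append(s + float((np.arange(frame) * w).sum() / w.sum()))
    return instants
```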
2008
M R P Thomas; J Gudnason; P A Naylor
Application of the DYPSA Algorithm to Segmented Time-Scale Modification of Speech
In Proc. European Signal Processing Conf. (EUSIPCO), Lausanne, Switzerland, August 2008.
URL: http://ieeexplore.ieee.org/document/7080347/
Abstract: This paper presents a method for speech time-scale modification. Voiced speech is pseudo-periodic, allowing time-scale modification by the repetition or removal of cycles as necessary. However, in the case of unvoiced speech and at the boundaries of voiced speech, no such periodicity exists, so the speech should not be modified. To address this issue, the proposed approach is novel in its use of the DYPSA algorithm to derive speech periodicity from glottal closure instants (GCIs), followed by a Gaussian mixture model-based voiced/unvoiced/silence (VUS) classifier. A listening test based on ITU-T P.800 has been conducted and has shown that, by employing VUS detection, the average mean opinion score of the perceptual quality of processed speech exceeds that of a method without VUS detection by 0.61 over a range of modification factors. Results are presented as a function of modification factor for normal and fast original talking rates. Reliable time-scale modification of high audio quality enables many applications, such as time-scale compression for fast scanning of recorded voicemail messages, slowing talking rate for improved intelligibility in forensics, and lip synchronization in motion video.
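The core voiced-segment operation, repeating or removing whole glottal cycles delimited by GCIs, reduces to a few lines. The sketch below assumes GCIs are given and omits the cross-fading at cycle joins and the VUS gating the paper adds around it.

```python
import numpy as np

def tsm_voiced(x, gcis, alpha):
    """Time-scale a voiced segment by factor alpha by emitting each glottal
    cycle (delimited by consecutive GCIs) zero or more times. alpha > 1
    lengthens, alpha < 1 shortens; unvoiced/silence regions classified by
    the VUS stage should be passed through unmodified."""
    out, acc = [], 0.0
    for a, b in zip(gcis[:-1], gcis[1:]):
        acc += alpha
        while acc >= 1.0:                 # repeat or drop whole cycles
            out.append(x[a:b])
            acc -= 1.0
    return np.concatenate(out) if out else np.zeros(0)
```

With alpha = 1 every cycle is emitted once; with alpha = 2 each cycle appears twice; with alpha = 0.5 every other cycle is dropped, all without altering pitch, since each emitted cycle keeps its original length.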
M R P Thomas; P A Naylor
The SIGMA Algorithm for Estimation of Reference-Quality Glottal Closure Instants from Electroglottograph Signals
In Proc. European Signal Processing Conf. (EUSIPCO), Lausanne, Switzerland, August 2008.
URL: http://ieeexplore.ieee.org/document/7080348/
Abstract: Accurate estimation of glottal closure instants (GCIs) in voiced speech is important for speech analysis applications which benefit from glottal-synchronous processing. Electroglottograph (EGG) recordings give a measure of the electrical conductance of the glottis, providing a signal which is proportional to its contact area. EGG signals contain little noise or distortion, providing a good reference from which GCIs can be extracted to evaluate GCI estimation from speech recordings. Many approaches impose a threshold on the differentiated EGG signal; these provide accurate results during voiced speech but are prone to errors at the onset and end of voicing. Modern algorithms use a similar approach across multiple dyadic scales using the stationary wavelet transform. This paper describes a new method for EGG-based GCI estimation named SIGMA, based upon the stationary wavelet transform, peak detection with a group delay function, and Gaussian mixture modelling for discrimination between true and false GCI candidates. In most real-world environments, it is necessary to estimate GCIs from a speech signal recorded with a microphone placed at some distance from the talker. The presence of reverberation, noise and filtering by the vocal tract renders GCI detection from real speech signals relatively difficult compared with the EGG, so EGG-based references have often been used to evaluate GCI detection from speech signals. Evaluation against 500 hand-labelled sentences has shown an accuracy of 99.35%, a 4.7% improvement over a popular existing method.
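The final discrimination stage, an unsupervised GMM separating true from false candidates, might look like the sketch below. The two-class setup follows the abstract; the feature vector (here, with peak amplitude as feature 0) is an assumption, not the paper's feature set.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

def keep_true_candidates(feats):
    """feats: (n_candidates, n_features) array, one row per GCI candidate,
    with feature 0 assumed to be the local peak amplitude. Fit an
    unsupervised two-class GMM and take the cluster with the larger mean
    amplitude as the true-GCI class."""
    gmm = GaussianMixture(n_components=2, random_state=0).fit(feats)
    true_cls = int(np.argmax(gmm.means_[:, 0]))
    return gmm.predict(feats) == true_cls    # boolean mask over candidates
```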
2007
N D Gaubitch; M R P Thomas; P A Naylor
Subband Method for Multichannel Least Squares Equalization of Room Transfer Functions
In Proc. IEEE Workshop on Applications of Signal Processing to Audio and Acoustics, New Paltz, NY, October 2007.
URL: http://ieeexplore.ieee.org/document/4392981/
Abstract: Equalization of room transfer functions (RTFs) is important in many speech and audio processing applications. It is a challenging problem because RTFs are several thousand taps long and non-minimum phase, and in practice only approximate measurements of the RTFs are available. In this paper, we present a subband multichannel least squares method for equalization of RTFs which is computationally efficient and less sensitive to inaccuracies in the measured RTFs compared to its fullband counterpart. Experimental results using simulated impulse responses demonstrate the performance of the algorithm.
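The fullband multichannel least-squares core that the subband method builds on is standard: stack per-channel convolution matrices and solve for equalizing filters that jointly map the RTFs to a delayed unit impulse. The subband decomposition itself is omitted from this sketch, and the filter length and delay are illustrative.

```python
import numpy as np

def conv_matrix(h, n_g):
    """(len(h) + n_g - 1, n_g) convolution matrix of filter h."""
    H = np.zeros((len(h) + n_g - 1, n_g))
    for j in range(n_g):
        H[j:j + len(h), j] = h
    return H

def mc_ls_equalizer(rirs, n_g, delay=0):
    """Multichannel LS equalizer: filters g_m minimising
    || sum_m conv(h_m, g_m) - d ||^2, with d a unit impulse delayed by
    `delay` samples. Assumes equal-length RIRs; with enough channels and
    taps the system becomes exactly invertible (the MINT condition)."""
    H = np.hstack([conv_matrix(h, n_g) for h in rirs])
    d = np.zeros(H.shape[0])
    d[delay] = 1.0
    g, *_ = np.linalg.lstsq(H, d, rcond=None)
    return g.reshape(len(rirs), n_g)      # one equalizing filter per channel
```

A nonzero `delay` relaxes the causality constraint and typically lowers the residual error, which is why modelling delay is common in LS equalizer design.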
M R P Thomas; N D Gaubitch; J Gudnason; P A Naylor
A Practical Multichannel Dereverberation Algorithm Using Multichannel DYPSA and Spatiotemporal Averaging
In Proc. IEEE Workshop on Applications of Signal Processing to Audio and Acoustics, New Paltz, NY, October 2007.
URL: http://ieeexplore.ieee.org/document/4392983/
Abstract: Speech signals for hands-free telecommunication applications are received by one or more microphones placed at some distance from the talker. In an office environment, for example, unwanted signals such as reverberation and background noise from computers and other talkers will degrade the quality of the received signal. These unwanted components have an adverse effect upon speech processing algorithms and impair intelligibility. This paper demonstrates the use of the Multichannel DYPSA algorithm to identify glottal closure instants (GCIs) from noisy, reverberant speech. Using the estimated GCIs, a spatiotemporal averaging technique is applied to attenuate the unwanted components. Experiments with a microphone array demonstrate the dereverberation and noise suppression of the spatiotemporal averaging method, showing up to a 5 dB improvement in segmental SNR and 0.33 in normalized Bark spectral distortion score.
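A simplified picture of GCI-synchronous spatiotemporal averaging, assuming the channels are already time-aligned and per-channel GCIs are available: averaging corresponding larynx cycles across microphones and a few neighbouring cycles reinforces the coherent direct-path speech, while reverberation and noise, incoherent from cycle to cycle, are attenuated. The windowing and weighting of the actual method are omitted.

```python
import numpy as np

def spatiotemporal_average(chans, gcis, n_nbr=3):
    """chans: list of time-aligned microphone signals; gcis: per-channel
    GCI sample indices. Averages each cycle with its n_nbr neighbours
    across all channels, truncating cycles to a common length."""
    clen = min(int(np.diff(g).min()) for g in gcis)   # common cycle length
    n_cyc = min(len(g) - 1 for g in gcis)
    out = []
    for k in range(n_cyc):
        acc, cnt = np.zeros(clen), 0
        for x, g in zip(chans, gcis):
            for j in range(max(0, k - n_nbr), min(n_cyc, k + n_nbr + 1)):
                acc += x[g[j]:g[j] + clen]
                cnt += 1
        out.append(acc / cnt)
    return np.concatenate(out)
```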
M R P Thomas; N D Gaubitch; P A Naylor
Multichannel DYPSA for estimation of glottal closure instants in reverberant speech
In Proc. European Signal Processing Conf. (EUSIPCO), Poznan, Poland, September 2007.
URL: http://ieeexplore.ieee.org/document/7099170/
Abstract: Identification of glottal closure instants (GCIs) is important in speech applications which benefit from larynx-synchronous processing. In modern telecommunication applications, speech signals are often obtained inside office rooms, with one or more microphones placed at a distance from the talker. Such speech signals are affected by reverberation due to the reflections from surrounding walls and objects, which distort the observed speech signals and degrade the performance of speech processing algorithms. This paper presents a study of the identifiability of GCIs from reverberant speech using the Dynamic Programming Projected Phase-Slope Algorithm (DYPSA) and new extensions to the multimicrophone case. Two multichannel algorithms are proposed and evaluated; in both cases, considerable performance gains over a single microphone are obtained, with detection rates improved by up to 29% in highly reverberant environments.
2006
M R P Thomas
A Novel Loudspeaker Equalizer
Master's thesis, Imperial College London, May 2006.
URL: http://soundfieldanalysis.org/wp-content/uploads/2017/03/Thomas2006.pdf
Abstract: Fundamentally, loudspeaker design has changed very little in the past 30 years. The late 70s saw the introduction of electromechanical modelling techniques which remain the basis for the design and analysis of loudspeakers today. Aside from the use of lighter, stronger materials in the construction of drive units, the area which has undergone the most development is the design of equalisers. Classically, loudspeakers use a passive analogue filter network to compensate for a non-flat frequency response, but such networks are far from ideal and are prone to variation with temperature and age. Recent years have seen the use of linear DSP equalisers which conform to much tighter specifications. However, there are two major flaws in the design of even the latest equalisers. First, it has long been known that at extreme voice coil excursion a loudspeaker is a nonlinear device, yet no documented nonlinear equaliser has been proposed. Second, swept sine tones are commonly used to characterise frequency responses; these can yield different results from methods where excitation energy is spread over a wider bandwidth (more akin to music or speech), and generally provide no phase information, which is an area of increasing interest in loudspeaker equalisation. This project investigates the analysis of loudspeaker frequency responses using a Maximum-Length Sequence (MLS) or M-Sequence decorrelation technique, which uses pseudorandom noise to yield both amplitude and phase information. The measurements are used to model the frequency response over a wide range of amplitudes, fitting a Volterra-series approximation and defining a level-dependent equaliser.
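The MLS measurement idea is compact enough to sketch: excite the loudspeaker with a bipolar maximum-length sequence and recover the response, both amplitude and phase, by circular cross-correlation. Here `play_and_record` is a hypothetical callback standing in for the audio interface (it returns a recording the same length as its input), and `nbits` sets the sequence period; the Volterra fitting stage is not shown.

```python
import numpy as np
from scipy.signal import max_len_seq

def measure_ir(play_and_record, nbits=16):
    """MLS impulse-response measurement via circular cross-correlation."""
    seq, _ = max_len_seq(nbits)                  # 0/1 sequence, period 2^n - 1
    x = 2.0 * seq - 1.0                          # bipolar +-1 excitation
    # Play two periods and keep the second, so the correlation is circular
    y = play_and_record(np.tile(x, 2))[len(x):]
    X, Y = np.fft.rfft(x), np.fft.rfft(y)
    # The MLS autocorrelation is nearly a delta, so cross-correlation
    # recovers h (amplitude and phase) up to a small DC bias
    return np.fft.irfft(Y * np.conj(X), n=len(x)) / len(x)
```

Because the excitation energy is spread across the whole band, repeating the measurement at several playback levels exposes the level-dependent (nonlinear) behaviour the thesis models.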