from google.cloud import speech_v1p1beta1 as speech


def transcribe_with_model_adaptation(
    project_id, location, storage_uri, custom_class_id, phrase_set_id
):
    """
    Create a `PhraseSet` to provide a custom list of similar
    items that are likely to occur in your input data.
    """

    # Create the adaptation client
    adaptation_client = speech.AdaptationClient()

    # The parent resource where the custom class and phrase set will be created.
    parent = f"projects/{project_id}/locations/{location}"

    # Create the phrase set resource
    phrase_set_response = adaptation_client.create_phrase_set(
        {
            "parent": parent,
            "phrase_set_id": phrase_set_id,
            "phrase_set": {
                "boost": 10,
                "phrases": [
                    {"value": "fare"}
                ],
            },
        }
    )
    phrase_set_name = phrase_set_response.name

    # The next section shows how to use the newly created
    # phrase set to send a transcription request with speech adaptation.

    # Speech adaptation configuration
    speech_adaptation = speech.SpeechAdaptation(phrase_set_references=[phrase_set_name])

    # Speech configuration object
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000,
        language_code="en-US",
        adaptation=speech_adaptation,
    )

    # The name of the audio file to transcribe
    # storage_uri: URI of the audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    audio = speech.RecognitionAudio(uri=storage_uri)

    # Create the speech client
    speech_client = speech.SpeechClient()

    response = speech_client.recognize(config=config, audio=audio)

    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))
from google.cloud import speech_v1p1beta1 as speech


def transcribe_with_model_adaptation(
    project_id, location, storage_uri, custom_class_id, phrase_set_id
):
    """
    Create a `PhraseSet` that references the prebuilt `$ADDRESSNUM` class so
    that street address numbers are more likely to be recognized in your input data.
    """

    # Create the adaptation client
    adaptation_client = speech.AdaptationClient()

    # The parent resource where the custom class and phrase set will be created.
    parent = f"projects/{project_id}/locations/{location}"

    # Create the phrase set resource
    phrase_set_response = adaptation_client.create_phrase_set(
        {
            "parent": parent,
            "phrase_set_id": phrase_set_id,
            "phrase_set": {
                "boost": 10,
                "phrases": [
                    {"value": "my address is $ADDRESSNUM"},
                    {"value": "$ADDRESSNUM"}
                ],
            },
        }
    )
    phrase_set_name = phrase_set_response.name

    # The next section shows how to use the newly created
    # phrase set to send a transcription request with speech adaptation.

    # Speech adaptation configuration
    speech_adaptation = speech.SpeechAdaptation(phrase_set_references=[phrase_set_name])

    # Speech configuration object
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000,
        language_code="en-US",
        adaptation=speech_adaptation,
    )

    # The name of the audio file to transcribe
    # storage_uri: URI of the audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    audio = speech.RecognitionAudio(uri=storage_uri)

    # Create the speech client
    speech_client = speech.SpeechClient()

    response = speech_client.recognize(config=config, audio=audio)

    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))
from google.cloud import speech_v1p1beta1 as speech


def transcribe_with_model_adaptation(
    project_id, location, storage_uri, custom_class_id, phrase_set_id
):
    """
    Create a `PhraseSet` and a `CustomClass` to provide custom lists of similar
    items that are likely to occur in your input data.
    """

    # Create the adaptation client
    adaptation_client = speech.AdaptationClient()

    # The parent resource where the custom class and phrase set will be created.
    parent = f"projects/{project_id}/locations/{location}"

    # Create the custom class resource
    adaptation_client.create_custom_class(
        {
            "parent": parent,
            "custom_class_id": custom_class_id,
            "custom_class": {
                "items": [
                    {"value": "sushido"},
                    {"value": "altura"},
                    {"value": "taneda"},
                ]
            },
        }
    )
    custom_class_name = (
        f"projects/{project_id}/locations/{location}/customClasses/{custom_class_id}"
    )

    # Create the phrase set resource
    phrase_set_response = adaptation_client.create_phrase_set(
        {
            "parent": parent,
            "phrase_set_id": phrase_set_id,
            "phrase_set": {
                "boost": 10,
                "phrases": [
                    {"value": f"Visit restaurants like ${{{custom_class_name}}}"}
                ],
            },
        }
    )
    phrase_set_name = phrase_set_response.name

    # The next section shows how to use the newly created custom
    # class and phrase set to send a transcription request with speech adaptation.

    # Speech adaptation configuration
    speech_adaptation = speech.SpeechAdaptation(phrase_set_references=[phrase_set_name])

    # Speech configuration object
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000,
        language_code="en-US",
        adaptation=speech_adaptation,
    )

    # The name of the audio file to transcribe
    # storage_uri: URI of the audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    audio = speech.RecognitionAudio(uri=storage_uri)

    # Create the speech client
    speech_client = speech.SpeechClient()

    response = speech_client.recognize(config=config, audio=audio)

    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))
For example, suppose you have many recordings in which speakers ask about the "fare to get into the county fair", in a setting where the word "fair" occurs more often than "fare". In this case, you can use model adaptation to increase the probability that the model recognizes both "fair" and "fare" by adding them as phrases to a PhraseSet resource. This tells Speech-to-Text to recognize "fair" and "fare" more often than, say, "hare" or "lair".
However, because "fair" occurs more frequently in the audio, it should be recognized more often than "fare". You may already have transcribed the audio with the Speech-to-Text API and found many errors in which the correct word ("fair") was missed. In this case, we recommend additionally using phrases with boost and assigning a higher boost value to "fair" than to "fare". Assigning a higher weight to "fair" biases the Speech-to-Text API toward choosing "fair" more often than "fare". Without boost values, the recognition model recognizes "fair" and "fare" with equal probability.
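As a sketch of that idea (the parent path, phrase set ID, and boost values below are illustrative placeholders, not tuned recommendations), each phrase in a `PhraseSet` can carry its own boost, so "fair" can be weighted more heavily than "fare":

from google.cloud import speech_v1p1beta1 as speech

adaptation_client = speech.AdaptationClient()

# Hypothetical project and location; adjust to your own resources.
parent = "projects/my-project-id/locations/global"

phrase_set_response = adaptation_client.create_phrase_set(
    {
        "parent": parent,
        "phrase_set_id": "fair-fare-phrase-set",  # hypothetical ID
        "phrase_set": {
            "phrases": [
                # Per-phrase boost values bias recognition toward "fair"
                # more strongly than "fare"; 15 and 10 are illustrative only.
                {"value": "fair", "boost": 15},
                {"value": "fare", "boost": 10},
            ],
        },
    }
)

The resulting phrase set can then be referenced from a recognition request exactly as in the earlier samples, via `SpeechAdaptation(phrase_set_references=[phrase_set_response.name])`.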
from google.cloud import speech_v1p1beta1 as speech


def sample_recognize(storage_uri, phrase):
    """
    Transcribe a short audio file with speech adaptation.

    Args:
        storage_uri: URI of the audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
        phrase: Phrase "hints" help recognize the specified phrases in your audio.
    """

    client = speech.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'
    # phrase = 'Brooklyn Bridge'

    phrases = [phrase]

    # Hint boost. This value increases the probability that a specific
    # phrase will be recognized over other similar-sounding phrases.
    # The higher the boost, the higher the chance of false-positive
    # recognition as well. It accepts a wide range of positive values.
    # Most use cases are best served with values between 0 and 20.
    # Using a binary search approach may help you find the optimal value.
    boost = 20.0
    speech_contexts_element = {"phrases": phrases, "boost": boost}
    speech_contexts = [speech_contexts_element]

    # Sample rate in hertz of the audio data sent
    sample_rate_hertz = 44100

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of the audio data sent. This sample sets it explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = speech.RecognitionConfig.AudioEncoding.MP3

    config = {
        "speech_contexts": speech_contexts,
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        # The first alternative is the most probable result.
        alternative = result.alternatives[0]
        print("Transcript: {}".format(alternative.transcript))