Skip to content

Speech Recognition

Module Description

Speech Recognition

The Speech Recognition module transcribes spoken language into text (speech-to-text). It can detect the spoken language automatically, detect named entities and custom entities from a dictionary, and optionally translate the transcript.

Module ID: speech_recognition

Module Parameters

Name Type Default Description
language string auto The language of the audio. The list of supported languages is available here. Can be set to auto if the language is not known and should be auto-detected.
mode string quality Processing mode for optimizing the inference for either quality or performance. (Available: quality, performance)
format_paragraph boolean true Format multiple segments into a paragraph of sentences. Disabling it produces shorter segments, which are suitable for subtitles. Keep it enabled if longer text segments are needed.
enable_vad boolean false Enable the voice activity detection (VAD) in order to improve the time-code accuracy.
vad_threshold number 0.6 Threshold for the voice activity detection (VAD).
enable_ner boolean false Apply named-entity recognition (NER) on transcribed segments.
detect_silence boolean false Detect and ignore the leading and the trailing silence in the audio.
merge_voice_gap_seconds number 0.5 Merge voice segments whose gap is within this threshold (in seconds).
dump_srt boolean false Whether to output an SRT file with the transcript as job artifact.
underline_words_in_srt boolean false Underline the current word in the SRT file by using the <u> tag.
dump_docx boolean false Whether to output a DOCX file with the transcript as job artifact.
translation_language string "none" Optional translation of the resulting transcript. See supported languages here.
dictionaries list of Dictionary Specification objects List of dictionaries to detect your own keywords/entities in the resulting transcription.


Send the following JSON as request body via POST to the /jobs/ endpoint:

  "sources": [
  "modules": {
    "speech_recognition": {
      "language": "german"

When requesting the job via GET on the /jobs/{JOB_ID}/ endpoint, the response looks like this:

  "id": "4d6c560e-0fc1-49ff-b190-6390e173c8c1",
  "tag": "",
  "state": "completed",
  "sources": [
  "modules": {
    "speech_recognition": {
      "language": "german",
      "translate_to_english": false,
      "enable_vad": false,
      "vad_threshold": 0.6,
      "merge_voice_gap_seconds": 0.5,
      "dump_srt": false,
      "state": "completed",
      "progress": 1
  "errors": [],
  "progress": 1,
  "duration": 51.79,
  "time_created": "2023-02-23 15:31:24.314000",
  "time_started": "2023-02-23 15:31:24.568000",
  "time_completed": "2023-02-23 15:32:16.358000",
  "media_type": "video",
  "result": {
    "summary": [
        "source": "",
        "media_type": "video",
        "info": null,
        "items": []
    "detailed_link": "",
    "summarized_link": ""

The above response tells you the state of the job.

To get the transcript with its time-codes, request the /jobs/{JOB_ID}/detailed-results/ endpoint. The response looks like this:

  "total": 28,
  "offset": 0,
  "limit": 100,
  "next": "",
  "prev": "",
  "data": [
      "id": "7098553b-fe58-4785-90d6-f6101a141d32",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Guten Tag, hier ist die Tagesschau in 100 Sekunden."
      "thumbnail": null,
      "detections": [],
      "time_start": 0,
      "time_end": 3,
      "tc_start": "00:00:00:00",
      "tc_end": "00:00:03:00"
      "id": "84b32442-d88f-40d1-b4c1-5bc364c15425",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Habeck plant schärferes Vorgehen bei Verstößen gegen Russlandsanktionen."
      "thumbnail": null,
      "detections": [],
      "time_start": 3,
      "time_end": 8,
      "tc_start": "00:00:03:00",
      "tc_end": "00:00:08:00"
      "id": "cd7c1781-a54c-4c46-a939-f3a79d98238e",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Der Druck auf Drittstaaten, die sanktionierte Güter aus Deutschland weiter nach Russland exportieren,"
      "thumbnail": null,
      "detections": [],
      "time_start": 8,
      "time_end": 12,
      "tc_start": "00:00:08:00",
      "tc_end": "00:00:12:00"
      "id": "b854a5db-f023-4c93-a18f-7795e67ced53",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "solle erhöht werden, so der Bundeswirtschaftsminister."
      "thumbnail": null,
      "detections": [],
      "time_start": 12,
      "time_end": 15,
      "tc_start": "00:00:12:00",
      "tc_end": "00:00:15:00"
      "id": "6d935513-4d77-4c2f-97d5-563f6e77e1db",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Es gehe darum, die Umgehung von Sanktionen europaweit unter Strafe zu stellen."
      "thumbnail": null,
      "detections": [],
      "time_start": 15,
      "time_end": 20,
      "tc_start": "00:00:15:00",
      "tc_end": "00:00:20:00"
      "id": "d7df6ede-4e7d-4442-8ad6-85dd2fd99873",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Bund-Länder treffen zur Neuordnung der Krankenhäuser."
      "thumbnail": null,
      "detections": [],
      "time_start": 20,
      "time_end": 23,
      "tc_start": "00:00:20:00",
      "tc_end": "00:00:23:00"
      "id": "ef61a4db-3bf8-48bf-8875-9fb4cc9f5856",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Bundesgesundheitsminister Lauterbach hat die Bundesländer aufgerufen, in die Kliniken zu investieren."
      "thumbnail": null,
      "detections": [],
      "time_start": 23,
      "time_end": 29,
      "tc_start": "00:00:23:00",
      "tc_end": "00:00:29:00"
      "id": "8284b890-d601-40bb-8a19-e5c128fbc1ee",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Er will unter anderem die Pauschalbeträge pro Patient- und Behandlungsfall absenken."
      "thumbnail": null,
      "detections": [],
      "time_start": 29,
      "time_end": 34,
      "tc_start": "00:00:29:00",
      "tc_end": "00:00:34:00"
      "id": "371c4ed3-798a-4593-b89c-342a6145c0a0",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Im Gegenzug sollen die Krankenhäuser feste Beträge für das Vorhalten von Personal, einer Notaufnahme und Technik bekommen."
      "thumbnail": null,
      "detections": [],
      "time_start": 34,
      "time_end": 41,
      "tc_start": "00:00:34:00",
      "tc_end": "00:00:41:00"
      "id": "3d9455b2-4e4b-448a-88af-918ee51727b2",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Einstellung des Falls Uri Djallo-Rechtens."
      "thumbnail": null,
      "detections": [],
      "time_start": 41,
      "time_end": 44,
      "tc_start": "00:00:41:00",
      "tc_end": "00:00:44:00"
      "id": "102fb035-8ad4-4a23-ae68-d25682600f2c",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Das Bundesverfassungsgericht ließ eine Beschwerde gegen weitere Ermittlungen in dem Fall nicht zu."
      "thumbnail": null,
      "detections": [],
      "time_start": 44,
      "time_end": 49,
      "tc_start": "00:00:44:00",
      "tc_end": "00:00:49:00"
      "id": "3273aa99-5df7-413d-88aa-c1ea61d9a4ce",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Vor 18 Jahren war der aus Westafrika stammende Uri Djallo gefesselt,"
      "thumbnail": null,
      "detections": [],
      "time_start": 49,
      "time_end": 53,
      "tc_start": "00:00:49:00",
      "tc_end": "00:00:53:00"
      "id": "3bca798a-89c8-47c6-83c2-e5ebcc90ac04",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "auf einer Matratze liegend in einem Gefängnis in Dessau verbrannt."
      "thumbnail": null,
      "detections": [],
      "time_start": 53,
      "time_end": 57,
      "tc_start": "00:00:53:00",
      "tc_end": "00:00:57:00"
      "id": "90daa8a3-a621-41b8-b7c2-d1c4830b0b1f",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Die zwei bisherigen Gerichtsprozesse hätten dem Grundrecht auf Strafverfolgung Hinrechnung getragen, so das Verfassungsgericht."
      "thumbnail": null,
      "detections": [],
      "time_start": 57,
      "time_end": 65,
      "tc_start": "00:00:57:00",
      "tc_end": "00:01:05:00"
      "id": "876d874c-0f97-4ebe-a049-ede5d6422f70",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Mehrere EU-Staaten wollen Verbot von PFAS-Chemikalien."
      "thumbnail": null,
      "detections": [],
      "time_start": 65,
      "time_end": 70,
      "tc_start": "00:01:05:00",
      "tc_end": "00:01:10:00"
      "id": "5135d1db-8ba5-464c-80b3-a74ecd308398",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Nach Recherchen von NDR, WDR und Süddeutscher Zeitung sind die sogenannten Ewigkeitschemikalien,"
      "thumbnail": null,
      "detections": [],
      "time_start": 70,
      "time_end": 75,
      "tc_start": "00:01:10:00",
      "tc_end": "00:01:15:00"
      "id": "97285dfb-9f63-4902-986c-01fa51c6fd51",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "die zum Beispiel in Jacken, Löschschaum oder als Bratpfannenbeschichtung verwendet werden,"
      "thumbnail": null,
      "detections": [],
      "time_start": 75,
      "time_end": 80,
      "tc_start": "00:01:15:00",
      "tc_end": "00:01:20:00"
      "id": "63f104f7-c9be-41e4-a12f-1acec0dd7bc4",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "bereits vielerorts im Grundwasser nachweisbar."
      "thumbnail": null,
      "detections": [],
      "time_start": 80,
      "time_end": 83,
      "tc_start": "00:01:20:00",
      "tc_end": "00:01:23:00"
      "id": "d475bb7e-11f9-4cf1-9b70-9840a589d0ba",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Da sie als krebserregend gelten, streben Deutschland und weitere EU-Staaten nun ein Verbot bis 2025 an,"
      "thumbnail": null,
      "detections": [],
      "time_start": 83,
      "time_end": 90,
      "tc_start": "00:01:23:00",
      "tc_end": "00:01:30:00"
      "id": "d130ac5d-4318-491a-86ec-f723e06ce544",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "so Bundesumweltministerin Lemke."
      "thumbnail": null,
      "detections": [],
      "time_start": 90,
      "time_end": 93,
      "tc_start": "00:01:30:00",
      "tc_end": "00:01:33:00"
      "id": "5aac6be9-1249-44b2-ad44-2aa9da08f1c5",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Heftige Winterstürme in den USA."
      "thumbnail": null,
      "detections": [],
      "time_start": 93,
      "time_end": 95,
      "tc_start": "00:01:33:00",
      "tc_end": "00:01:35:00"
      "id": "f01fe052-ab6e-4252-a9f9-6c08374831a4",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Vor allem im Norden des Landes kam es wegen Schneefällen und umgestürzter Bäume zu Verkehrsbehinderungen."
      "thumbnail": null,
      "detections": [],
      "time_start": 95,
      "time_end": 100,
      "tc_start": "00:01:35:00",
      "tc_end": "00:01:40:00"
      "id": "c627cda4-3b09-4925-86fd-ddc0b45a7657",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Schulen und Flughäfen blieben geschlossen."
      "thumbnail": null,
      "detections": [],
      "time_start": 100,
      "time_end": 103,
      "tc_start": "00:01:40:00",
      "tc_end": "00:01:43:00"
      "id": "c4bc05f0-fb04-495b-b2ad-8486e96c179e",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Tausende Haushalte waren ohne Strom."
      "thumbnail": null,
      "detections": [],
      "time_start": 103,
      "time_end": 105,
      "tc_start": "00:01:43:00",
      "tc_end": "00:01:45:00"
      "id": "0c32b710-2583-4d19-ab87-b9ffdd98abfa",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Auch für eigentlich sonnige Regionen in Kalifornien wurden Schneewarnungen ausgegeben."
      "thumbnail": null,
      "detections": [],
      "time_start": 105,
      "time_end": 111,
      "tc_start": "00:01:45:00",
      "tc_end": "00:01:51:00"
      "id": "34de3de4-2d58-4e1e-8771-9b1e122c07f0",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Meteorologen rechnen auch in den kommenden Tagen nicht mit einer Wetterbesserung."
      "thumbnail": null,
      "detections": [],
      "time_start": 111,
      "time_end": 116,
      "tc_start": "00:01:51:00",
      "tc_end": "00:01:56:00"
      "id": "6ebf9060-1663-45f7-9b1c-1f1a5c86640c",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Das Wetter morgen im Norden und Westen. Dichte Wolken mit Regen und teilweise Schneeregen."
      "thumbnail": null,
      "detections": [],
      "time_start": 116,
      "time_end": 121,
      "tc_start": "00:01:56:00",
      "tc_end": "00:02:01:00"
      "id": "cd4b3f41-3952-457a-ae71-e8f8f1a149fb",
      "media_type": "audio",
      "frame_start": null,
      "frame_end": null,
      "source": "",
      "module": "speech_recognition",
      "meta": {
        "text": "Im Süden erst Sonne, später Regen. 3 bis 15 Grad."
      "thumbnail": null,
      "detections": [],
      "time_start": 121,
      "time_end": 150,
      "tc_start": "00:02:01:00",
      "tc_end": "00:02:30:00"