From 6289e7f40183d6705a97217f4d942e7ae5603163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Kwiecie=C5=84?= Date: Sun, 14 Apr 2024 15:59:29 +0200 Subject: [PATCH] Use stdin instead of a server --- internal/api/handler.go | 28 ++++ internal/resources/cli_arguments.go | 14 ++ main.go | 63 +++++---- pkg/whisper/FullParams.go | 4 +- pkg/whisper/language.go | 207 ---------------------------- 5 files changed, 84 insertions(+), 232 deletions(-) delete mode 100644 pkg/whisper/language.go diff --git a/internal/api/handler.go b/internal/api/handler.go index 8ae58f4..57db25b 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -50,6 +50,34 @@ func TranscribeFromFile(c echo.Context, whisperState *WhisperState) error { return c.JSON(http.StatusOK, response) } +func TranscribeBytes(buffer []byte, whisperState *WhisperState) string { + whisperState.mutex.Lock() + defer whisperState.mutex.Unlock() + + bufferSpecial, err := whisperState.media.LoadAudioFileData(&buffer, true) + + if err != nil { + println("Error loading audio file data: ", err) + // return err + } + + err = whisperState.context.RunStreamed(whisperState.params, bufferSpecial) + if err != nil { + println("Error processing audio: ", err) + // return err + } + + result, err := getResult(whisperState.context) + if err != nil { + println("Error getting result: ", err) + // return err + } + + trimed := strings.TrimLeft(result, " ") + return trimed + +} + func Transcribe(c echo.Context, whisperState *WhisperState) error { // Get the file header fileHeader, err := c.FormFile("file") diff --git a/internal/resources/cli_arguments.go b/internal/resources/cli_arguments.go index 3a29a48..01d311a 100644 --- a/internal/resources/cli_arguments.go +++ b/internal/resources/cli_arguments.go @@ -20,6 +20,7 @@ type ParsedArguments struct { Language int32 ModelPath string Port int + // Buffer []byte } type LanguageMap map[string]string @@ -64,6 +65,8 @@ func ParseFlags() (*ParsedArguments, error) { flag.StringVar(&args.Language, "language", "", "Language to be processed") // Optional: Redundant to demonstrate flag.StringVar(&args.ModelPath, "m", "", "Path to the model file (required)") flag.StringVar(&args.ModelPath, "modelPath", "", "Path to the model file (required)") // Optional: Redundant + // bufferArg := flag.String("buffer", "", "Base64-encoded buffer data") + flag.IntVar(&args.Port, "p", 3031, "Port to start the server on") flag.IntVar(&args.Port, "port", 3031, "Port to start the server on") // Optional: Redundant @@ -76,6 +79,16 @@ func ParseFlags() (*ParsedArguments, error) { // Parsing flags flag.Parse() + // if *bufferArg != "" { + // decodedBuffer, err := base64.StdEncoding.DecodeString(*bufferArg) + // if err != nil { + // fmt.Println("Error decoding buffer:", err) + // return nil, err + // } + // // Process the decoded buffer (e.g., print its contents) + // fmt.Println("Decoded Buffer:", string(decodedBuffer)) + // } + args.Language = strings.ToLower(args.Language) if args.ModelPath == "" { @@ -92,5 +105,6 @@ func ParseFlags() (*ParsedArguments, error) { Language: languageCode, ModelPath: args.ModelPath, Port: args.Port, + // Buffer: []byte(*bufferArg), }, nil } diff --git a/main.go b/main.go index 5465df9..b9eee5b 100644 --- a/main.go +++ b/main.go @@ -1,21 +1,24 @@ package main import ( + "bufio" + "encoding/base64" "fmt" "os" "path/filepath" - "github.com/labstack/echo/v4" - "github.com/labstack/echo/v4/middleware" - "github.com/labstack/gommon/log" "github.com/xzeldon/whisper-api-server/internal/api" "github.com/xzeldon/whisper-api-server/internal/resources" ) -func change_working_directory(e *echo.Echo) { +// begin delimiter const +const beginDelimiter = "[begin]" +const endDelimiter = "[end]" + +func change_working_directory() { exePath, errs := os.Executable() if errs != nil { - e.Logger.Error(errs) + println("Error getting executable path") return } @@ -24,7 +27,7 @@ func change_working_directory(e *echo.Echo) { // Change the working directory to the executable directory errs = os.Chdir(exeDir) if errs != nil { - e.Logger.Error(errs) + println("Error changing working directory") return } @@ -34,32 +37,46 @@ func change_working_directory(e *echo.Echo) { func main() { - e := echo.New() - e.HideBanner = true - change_working_directory(e) + change_working_directory() args, errParsing := resources.ParseFlags() if errParsing != nil { - e.Logger.Error("Error parsing flags: ", errParsing) + println("Error parsing flags: ", errParsing) return } - e.Use(middleware.CORS()) - - if l, ok := e.Logger.(*log.Logger); ok { - l.SetHeader("${time_rfc3339} ${level}") - } - whisperState, err := api.InitializeWhisperState(args.ModelPath, args.Language) if err != nil { - e.Logger.Error(err) + println("Error initializing whisper state: ", err) + } + const maxCapacity = 2048 * 10240 + + scanner := bufio.NewScanner(os.Stdin) + buf := make([]byte, maxCapacity) + scanner.Buffer(buf, maxCapacity) + + println("waiting_for_input") + if scanner.Scan() { + base64Data := scanner.Text() + decodedBuffer, err := base64.StdEncoding.DecodeString(base64Data) + if err != nil { + fmt.Println("Error decoding buffer:", err) + return + } + + result := api.TranscribeBytes(decodedBuffer, whisperState) + println(beginDelimiter + result + endDelimiter) + println("finished") + + // Process the decodedBuffer (e.g., print its length) + fmt.Println("Received buffer size:", len(decodedBuffer)) + + // Send a response back to Node.js (optional) + fmt.Fprintln(os.Stdout, "Buffer received successfully!") + } else if err := scanner.Err(); err != nil { + fmt.Println("Error reading from stdin:", err) } - e.POST("/v1/audio/transcriptions", func(c echo.Context) error { - - return api.Transcribe(c, whisperState) - }) - - e.Logger.Fatal(e.Start(fmt.Sprintf("127.0.0.1:%d", args.Port))) + // e.Logger.Fatal(e.Start(fmt.Sprintf("127.0.0.1:%d", args.Port))) } diff --git a/pkg/whisper/FullParams.go b/pkg/whisper/FullParams.go index 4bbdd0a..98dfd4d 100644 --- a/pkg/whisper/FullParams.go +++ b/pkg/whisper/FullParams.go @@ -93,7 +93,7 @@ func (this *FullParams) SetLanguage(language int32) { return } - this.cStruct.Language = eLanguage(language) + this.cStruct.Language = language } /*using pfnNewSegment = HRESULT( __cdecl* )( iContext* ctx, uint32_t n_new, void* user_data ) noexcept;*/ @@ -178,7 +178,7 @@ type _FullParams struct { offset_ms int32 duration_ms int32 Flags eFullParamsFlags - Language eLanguage + Language int32 thold_pt float32 thold_ptsum float32 diff --git a/pkg/whisper/language.go b/pkg/whisper/language.go deleted file mode 100644 index 4e6a042..0000000 --- a/pkg/whisper/language.go +++ /dev/null @@ -1,207 +0,0 @@ -package whisper - -// https://github.com/Const-me/Whisper/blob/master/WhisperNet/API/eLanguage.cs - -type eLanguage int32 - -const ( - Auto eLanguage = -1 // "af" - - Afrikaans = 0x6661 // "af" - /// Albanian - Albanian = 0x7173 // "sq" - /// Amharic - Amharic = 0x6D61 // "am" - /// Arabic - Arabic = 0x7261 // "ar" - /// Armenian - Armenian = 0x7968 // "hy" - /// Assamese - Assamese = 0x7361 // "as" - /// Azerbaijani - Azerbaijani = 0x7A61 // "az" - /// Bashkir - Bashkir = 0x6162 // "ba" - /// Basque - Basque = 0x7565 // "eu" - /// Belarusian - Belarusian = 0x6562 // "be" - /// Bengali - Bengali = 0x6E62 // "bn" - /// Bosnian - Bosnian = 0x7362 // "bs" - /// Breton - Breton = 0x7262 // "br" - /// Bulgarian - Bulgarian = 0x6762 // "bg" - /// Catalan - Catalan = 0x6163 // "ca" - /// Chinese - Chinese = 0x687A // "zh" - /// Croatian - Croatian = 0x7268 // "hr" - /// Czech - Czech = 0x7363 // "cs" - /// Danish - Danish = 0x6164 // "da" - /// Dutch - Dutch = 0x6C6E // "nl" - /// English - English = 0x6E65 // "en" - /// Estonian - Estonian = 0x7465 // "et" - /// Faroese - Faroese = 0x6F66 // "fo" - /// Finnish - Finnish = 0x6966 // "fi" - /// French - French = 0x7266 // "fr" - /// Galician - Galician = 0x6C67 // "gl" - /// Georgian - Georgian = 0x616B // "ka" - /// German - German = 0x6564 // "de" - /// Greek - Greek = 0x6C65 // "el" - /// Gujarati - Gujarati = 0x7567 // "gu" - /// Haitian Creole - HaitianCreole = 0x7468 // "ht" - /// Hausa - Hausa = 0x6168 // "ha" - /// Hawaiian - Hawaiian = 0x776168 // "haw" - /// Hebrew - Hebrew = 0x7769 // "iw" - /// Hindi - Hindi = 0x6968 // "hi" - /// Hungarian - Hungarian = 0x7568 // "hu" - /// Icelandic - Icelandic = 0x7369 // "is" - /// Indonesian - Indonesian = 0x6469 // "id" - /// Italian - Italian = 0x7469 // "it" - /// Japanese - Japanese = 0x616A // "ja" - /// Javanese - Javanese = 0x776A // "jw" - /// Kannada - Kannada = 0x6E6B // "kn" - /// Kazakh - Kazakh = 0x6B6B // "kk" - /// Khmer - Khmer = 0x6D6B // "km" - /// Korean - Korean = 0x6F6B // "ko" - /// Lao - Lao = 0x6F6C // "lo" - /// Latin - Latin = 0x616C // "la" - /// Latvian - Latvian = 0x766C // "lv" - /// Lingala - Lingala = 0x6E6C // "ln" - /// Lithuanian - Lithuanian = 0x746C // "lt" - /// Luxembourgish - Luxembourgish = 0x626C // "lb" - /// Macedonian - Macedonian = 0x6B6D // "mk" - /// Malagasy - Malagasy = 0x676D // "mg" - /// Malay - Malay = 0x736D // "ms" - /// Malayalam - Malayalam = 0x6C6D // "ml" - /// Maltese - Maltese = 0x746D // "mt" - /// Maori - Maori = 0x696D // "mi" - /// Marathi - Marathi = 0x726D // "mr" - /// Mongolian - Mongolian = 0x6E6D // "mn" - /// Myanmar - Myanmar = 0x796D // "my" - /// Nepali - Nepali = 0x656E // "ne" - /// Norwegian - Norwegian = 0x6F6E // "no" - /// Nynorsk - Nynorsk = 0x6E6E // "nn" - /// Occitan - Occitan = 0x636F // "oc" - /// Pashto - Pashto = 0x7370 // "ps" - /// Persian - Persian = 0x6166 // "fa" - /// Polish - Polish = 0x6C70 // "pl" - /// Portuguese - Portuguese = 0x7470 // "pt" - /// Punjabi - Punjabi = 0x6170 // "pa" - /// Romanian - Romanian = 0x6F72 // "ro" - /// Russian - Russian = 0x7572 // "ru" - /// Sanskrit - Sanskrit = 0x6173 // "sa" - /// Serbian - Serbian = 0x7273 // "sr" - /// Shona - Shona = 0x6E73 // "sn" - /// Sindhi - Sindhi = 0x6473 // "sd" - /// Sinhala - Sinhala = 0x6973 // "si" - /// Slovak - Slovak = 0x6B73 // "sk" - /// Slovenian - Slovenian = 0x6C73 // "sl" - /// Somali - Somali = 0x6F73 // "so" - /// Spanish - Spanish = 0x7365 // "es" - /// Sundanese - Sundanese = 0x7573 // "su" - /// Swahili - Swahili = 0x7773 // "sw" - /// Swedish - Swedish = 0x7673 // "sv" - /// Tagalog - Tagalog = 0x6C74 // "tl" - /// Tajik - Tajik = 0x6774 // "tg" - /// Tamil - Tamil = 0x6174 // "ta" - /// Tatar - Tatar = 0x7474 // "tt" - /// Telugu - Telugu = 0x6574 // "te" - /// Thai - Thai = 0x6874 // "th" - /// Tibetan - Tibetan = 0x6F62 // "bo" - /// Turkish - Turkish = 0x7274 // "tr" - /// Turkmen - Turkmen = 0x6B74 // "tk" - /// Ukrainian - Ukrainian = 0x6B75 // "uk" - /// Urdu - Urdu = 0x7275 // "ur" - /// Uzbek - Uzbek = 0x7A75 // "uz" - /// Vietnamese - Vietnamese = 0x6976 // "vi" - /// Welsh - Welsh = 0x7963 // "cy" - /// Yiddish - Yiddish = 0x6979 // "yi" - /// Yoruba - Yoruba = 0x6F79 // "yo" -)