From c7b40276cfa2a5b68dd35c5f59d132161c2f59ee Mon Sep 17 00:00:00 2001 From: Matte23 Date: Sat, 9 Nov 2024 22:57:47 +0100 Subject: [PATCH] Add base code --- .gitignore | 5 + LICENSE | 0 cmd/process.go | 74 +++++++++ cmd/root.go | 52 +++++++ go.mod | 14 ++ go.sum | 67 ++++++++ internal/llm/paragraph_detector.go | 234 ++++++++++++++++++++++++++++ internal/llm/structure_builder.go | 199 +++++++++++++++++++++++ internal/llm/text_rewriter.go | 99 ++++++++++++ internal/llm/text_rewriter_large.go | 86 ++++++++++ internal/stt/whisper.go | 68 ++++++++ internal/utils/utils.go | 34 ++++ internal/video/audio_extractor.go | 32 ++++ main.go | 11 ++ 14 files changed, 975 insertions(+) create mode 100644 LICENSE create mode 100644 cmd/process.go create mode 100644 cmd/root.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 internal/llm/paragraph_detector.go create mode 100644 internal/llm/structure_builder.go create mode 100644 internal/llm/text_rewriter.go create mode 100644 internal/llm/text_rewriter_large.go create mode 100644 internal/stt/whisper.go create mode 100644 internal/utils/utils.go create mode 100644 internal/video/audio_extractor.go create mode 100644 main.go diff --git a/.gitignore b/.gitignore index adf8f72..408b8d8 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,8 @@ # Go workspace file go.work +# Output files +output/ + +# Input videos +*.mp4 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e69de29 diff --git a/cmd/process.go b/cmd/process.go new file mode 100644 index 0000000..b46ae3d --- /dev/null +++ b/cmd/process.go @@ -0,0 +1,74 @@ +/* +Copyright © 2024 Matteo Schiff + +*/ + +package cmd + +import ( + "log" + "strings" + + "git.underdesk.net/Matte23/transcriber/internal/llm" + "git.underdesk.net/Matte23/transcriber/internal/stt" + "git.underdesk.net/Matte23/transcriber/internal/video" + "github.com/sashabaranov/go-openai" + "github.com/spf13/cobra" +) + +// processCmd represents the process command +var 
processCmd = &cobra.Command{ + Use: "process", + Short: "Process video file by generating text notes", + Long: `A longer description that spans multiple lines and likely contains examples +and usage of using your command. For example: + +Cobra is a CLI library for Go that empowers applications. +This application is a tool to generate the needed files +to quickly create a Cobra application.`, + Run: func(cmd *cobra.Command, args []string) { + inputFile := cmd.Flag("input").Value.String() + filename, _, _ := strings.Cut(inputFile, ".") + + audioFile := "./output/" + filename + ".wav" + transcriptionFile := "./output/" + filename + ".transcription.txt" + splitFile := "./output/" + filename + ".split.json" + finalFile := "./output/" + filename + ".md" + + config := openai.DefaultConfig("") + config.BaseURL = "http://192.168.1.111:8080/v1" + client := openai.NewClientWithConfig(config) + + log.Println("Starting video to audio conversion") + video.ExtractVideo(inputFile, audioFile) + log.Println("Conversion terminated") + + transcriber := stt.NewWhisperTranscriver(client) + log.Println("Starting audio transcription") + transcription, _ := transcriber.Transcribe(audioFile, transcriptionFile) + log.Println("Audio transcription terminated") + + splitter := llm.NewStructureBuilder(client) + log.Println("Starting splitting transcription") + paragraphs, _ := splitter.Split(splitFile, transcription) + log.Println("Transcription splitted") + + llm.RewriteText(client, paragraphs, finalFile) + + log.Println("Text rewrite completed!") + }, +} + +func init() { + rootCmd.AddCommand(processCmd) + + // Here you will define your flags and configuration settings. 
+ + // Cobra supports Persistent Flags which will work for this command + // and all subcommands, e.g.: + // processCmd.PersistentFlags().String("foo", "", "A help for foo") + + // Cobra supports local flags which will only run when this command + // is called directly, e.g.: + processCmd.Flags().StringP("input", "i", "./input.mp4", "Input video to be processed") +} diff --git a/cmd/root.go b/cmd/root.go new file mode 100644 index 0000000..7bf53ba --- /dev/null +++ b/cmd/root.go @@ -0,0 +1,52 @@ +/* +Copyright © 2024 Matteo Schiff + +*/ + +package cmd + +import ( + "os" + + "github.com/spf13/cobra" +) + + + +// rootCmd represents the base command when called without any subcommands +var rootCmd = &cobra.Command{ + Use: "transcriber", + Short: "A brief description of your application", + Long: `A longer description that spans multiple lines and likely contains +examples and usage of using your application. For example: + +Cobra is a CLI library for Go that empowers applications. +This application is a tool to generate the needed files +to quickly create a Cobra application.`, + // Uncomment the following line if your bare application + // has an action associated with it: + // Run: func(cmd *cobra.Command, args []string) { }, +} + +// Execute adds all child commands to the root command and sets flags appropriately. +// This is called by main.main(). It only needs to happen once to the rootCmd. +func Execute() { + err := rootCmd.Execute() + if err != nil { + os.Exit(1) + } +} + +func init() { + // Here you will define your flags and configuration settings. + // Cobra supports persistent flags, which, if defined here, + // will be global for your application. + + // rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", "config file (default is $HOME/.transcriber.yaml)") + + // Cobra also supports local flags, which will only run + // when this action is called directly. 
+ rootCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle") +} + + diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..53790d1 --- /dev/null +++ b/go.mod @@ -0,0 +1,14 @@ +module git.underdesk.net/Matte23/transcriber + +go 1.22.7 + +require ( + github.com/aws/aws-sdk-go v1.55.5 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/jmespath/go-jmespath v0.4.0 // indirect + github.com/sashabaranov/go-openai v1.32.3 // indirect + github.com/spf13/cobra v1.8.1 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/u2takey/ffmpeg-go v0.5.0 // indirect + github.com/u2takey/go-utils v0.3.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..d79f48c --- /dev/null +++ b/go.sum @@ -0,0 +1,67 @@ +github.com/aws/aws-sdk-go v1.38.20/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= +github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= +github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= +github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= 
+github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/panjf2000/ants/v2 v2.4.2/go.mod h1:f6F0NZVFsGCp5A7QW/Zj/m92atWwOkY0OIhFxRNFr4A= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sashabaranov/go-openai v1.32.3 h1:6xZ393PbZFoJrgwveBXVZggmyH7zdp4joUdnCy7FFD8= +github.com/sashabaranov/go-openai v1.32.3/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= +github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= +github.com/spf13/cobra v1.8.1 
h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/u2takey/ffmpeg-go v0.5.0 h1:r7d86XuL7uLWJ5mzSeQ03uvjfIhiJYvsRAJFCW4uklU= +github.com/u2takey/ffmpeg-go v0.5.0/go.mod h1:ruZWkvC1FEiUNjmROowOAps3ZcWxEiOpFoHCvk97kGc= +github.com/u2takey/go-utils v0.3.1 h1:TaQTgmEZZeDHQFYfd+AdUT1cT4QJgJn/XVPELhHw4ys= +github.com/u2takey/go-utils v0.3.1/go.mod h1:6e+v5vEZ/6gu12w/DC2ixZdZtCrNokVxD0JUklcqdCs= +gocv.io/x/gocv v0.25.0/go.mod h1:Rar2PS6DV+T4FL+PM535EImD/h13hGVaHhnCu1xarBs= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200602225109-6fdc65e7d980/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys 
v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.7/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= diff --git a/internal/llm/paragraph_detector.go b/internal/llm/paragraph_detector.go new file mode 100644 index 0000000..55977f9 --- /dev/null +++ b/internal/llm/paragraph_detector.go @@ -0,0 +1,234 @@ +/* +Copyright © 2024 Matteo Schiff + +*/ + +package llm + +import ( + "context" + "encoding/json" + "fmt" + "os" + "sort" + "strconv" + "strings" + + "github.com/sashabaranov/go-openai" +) + +type ParagraphDetector struct { + client *openai.Client + useCache bool +} + +func NewParagraphDetector(client *openai.Client) *ParagraphDetector { + return &ParagraphDetector{ + client: client, + useCache: true, + } +} + +func (wt *ParagraphDetector) cacheFileName(inputAudio string) string { + return inputAudio + ".splitted.txt" +} + +func (wt *ParagraphDetector) loadCache(name string) ([]int, error) { + contentJson, err := 
os.ReadFile(wt.cacheFileName(name)) + + var content []int + err = json.Unmarshal(contentJson, &content) + + return content, err +} + +func (wt *ParagraphDetector) saveCache(name string, content []int) error { + contentJson, _ := json.Marshal(content) + + err := os.WriteFile(wt.cacheFileName(name), contentJson, 0666) + + return err +} + +type SplitResponse struct { + Topics map[string]string `json:"Topics"` +} + +func (s SplitResponse) MarshalJSON() ([]byte, error) { + return json.Marshal(struct{ + Topics map[string]string `json:"Topics"` + }{ + Topics: s.Topics, + }) +} + +func (wt *ParagraphDetector) splitUsingLLM(transcriptionSentences []string) []int { + systemPrompt := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleSystem, + //Content: "Identify topics in the following transcription. Topics are a group of sentences about the same argument. Format the output using a JSON integer array, where each item is the line number (integer). Each time you see one or two new topics, add the integer to the array.", + Content: ` +You are given a transcription of a conversation or speech. Your task is to identify where new topics or arguments begin within the text. A new topic is marked by a shift in subject, a new argument being introduced, or a clear change in focus. Additionally, ensure that each identified segment (paragraph) contains more than just a few sentences. If necessary, group smaller sections together so that each segment is substantial. + +Input: The transcription will be provided as a series of lines. Each line is numbered for easy reference. Your task is to read through the transcription and identify the points where a new topic or argument begins. If a segment is too short (fewer than three sentences), it should be grouped with an adjacent segment. For each identified new topic, write a brief title or summary describing it. 
+ +Output: The output should be formatted as a JSON object, where each key is the line number where a new topic starts, and the corresponding value is the title of that topic. For example: + +json + +{ + "0": "Title of First Topic", + "19": "Title of Second Topic" +} + +Example: + +Transcription: + +vbnet + +0: Hello, everyone. Today we're going to talk about climate change. +1: It's a complex issue, but I'll try to break it down. +2: Climate change refers to long-term shifts in temperatures and weather patterns. +3: In this discussion, we will cover the causes, effects, and possible solutions. +4: First, let's discuss the causes of climate change. +5: There are several factors, including greenhouse gas emissions. +6: Most emissions come from burning fossil fuels like coal, oil, and gas. +7: Another key cause is deforestation, which reduces the number of trees that can absorb CO2. +8: Deforestation not only affects CO2 levels but also disrupts ecosystems. +9: Next, let's move on to the effects of climate change. +10: Rising temperatures are one of the most obvious effects. +11: This leads to melting ice caps, rising sea levels, and extreme weather events. +12: We are already seeing more frequent and intense heatwaves, hurricanes, and floods. +13: The impact on wildlife is also severe, with many species facing habitat loss. +14: Finally, what can we do to address this issue? +15: One solution is to reduce our carbon footprint by using energy more efficiently. +16: Renewable energy sources, like wind and solar, play a big role here. +17: Governments and organizations worldwide are investing in clean energy technologies. +18: Individual actions, such as reducing waste and conserving water, also make a difference. +19: That's all for today's discussion on climate change. Thank you for listening. 
+ +Expected Output: + +json + +{ + "0": "Introduction to Climate Change", + "4": "Causes of Climate Change", + "9": "Effects of Climate Change", + "14": "Solutions to Climate Change" +} + +Instruction: Please read the transcription carefully and list the line numbers where each new topic or argument starts, along with a brief title summarizing that topic. Ensure that shorter segments are grouped with neighboring content to form more substantial paragraphs before marking the start of a new topic.`, + } + + currentStart := 0 + blockSize := 200 + + var startLines []int + + for currentStart < len(transcriptionSentences) { + if currentStart > len(transcriptionSentences) { + break + } + + blockEnd := currentStart+blockSize + + if blockEnd >= len(transcriptionSentences) { + blockEnd = len(transcriptionSentences) - 1 + } + + currentSlice := transcriptionSentences[currentStart : blockEnd] + + userPromptText := "" + for i, sentence := range currentSlice { + userPromptText += fmt.Sprintf("%d: %s\n", i, sentence) + } + + userPrompt := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleUser, + Content: userPromptText, + } + + resp1, err := wt.client.CreateChatCompletion( + context.Background(), + openai.ChatCompletionRequest{ + Model: "mistral-7b-instruct-v0.3", + Messages: []openai.ChatCompletionMessage{systemPrompt, userPrompt}, + ResponseFormat: &openai.ChatCompletionResponseFormat{ + Type: openai.ChatCompletionResponseFormatTypeJSONObject, + JSONSchema: &openai.ChatCompletionResponseFormatJSONSchema{ + Name: "splitter", + Strict: true, + Schema: SplitResponse{}, + }, + }, + }, + ) + + if err != nil { + fmt.Printf("Splitting error: %v\n", err) + return []int{} + } + fmt.Printf(resp1.Choices[0].Message.Content) + var data map[string]string + json.Unmarshal([]byte(resp1.Choices[0].Message.Content), &data) + //startLinesRaw = append(startLinesRaw, strings.Split(resp1.Choices[0].Message.Content, ",")...) 
+ + for k, _ := range data { + before, _, _ := strings.Cut(k, "-") + q, err := strconv.Atoi(before) + if err != nil { + continue + } + startLines = append(startLines, currentStart+q) + } + + fmt.Println(startLines) + + currentStart += blockSize + } + /*fmt.Println(startLinesRaw) + + var err error + for i, n := range startLinesRaw { + startLines[i], err = strconv.Atoi(n) + + if err != nil { + return nil + } + }*/ + sort.Ints(startLines) + return startLines +} + +func (wt *ParagraphDetector) Split(name string, transcription string) ([]string, error) { + transcriptionSentences := strings.Split(transcription, ".") + + cache, err := wt.loadCache(name) + + var startLines []int + if wt.useCache && err == nil { + startLines = cache + } else { + startLines = wt.splitUsingLLM(transcriptionSentences) + wt.saveCache(name, startLines) + } + + var splittedTranscription []string + for i := range startLines { + q := len(transcriptionSentences) - 1 + + if i < len(startLines) - 1 { + q = startLines[i+1] + } + + var currentParagraph string + for j := startLines[i]; j <= q; j++ { + currentParagraph += transcriptionSentences[j] + "." 
+ } + splittedTranscription = append(splittedTranscription, currentParagraph) + } + + + return splittedTranscription, nil +} diff --git a/internal/llm/structure_builder.go b/internal/llm/structure_builder.go new file mode 100644 index 0000000..edaf54e --- /dev/null +++ b/internal/llm/structure_builder.go @@ -0,0 +1,199 @@ +/* +Copyright © 2024 Matteo Schiff + +*/ + +package llm + +import ( + "context" + "encoding/json" + "fmt" + "os" + "sort" + "strconv" + "strings" + + "github.com/sashabaranov/go-openai" +) + +type StructureBuilder struct { + client *openai.Client + useCache bool +} + +func NewStructureBuilder(client *openai.Client) *StructureBuilder { + return &StructureBuilder{ + client: client, + useCache: true, + } +} + +func (wt *StructureBuilder) cacheFileName(inputAudio string) string { + return inputAudio + ".splitted.txt" +} + +func (wt *StructureBuilder) loadCache(name string) (map[string]ParagraphItem, error) { + contentJson, err := os.ReadFile(name) + + var content map[string]ParagraphItem + err = json.Unmarshal(contentJson, &content) + + return content, err +} + +func (wt *StructureBuilder) saveCache(name string, content map[string]ParagraphItem) error { + contentJson, _ := json.Marshal(content) + + err := os.WriteFile(name, contentJson, 0666) + + return err +} + +type ParagraphItem struct { + Title string + Type string + Content string `json:"-"` +} + +func (wt *StructureBuilder) splitUsingLLM(transcriptionSentences []string) map[string]ParagraphItem { + systemPrompt := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleSystem, + Content: ` +You are given a transcription of a conversation or speech. Your task is to identify where new topics or arguments begin within the text. A new topic is marked by a shift in subject, a new argument being introduced, or a clear change in focus. Additionally, ensure that each identified segment (paragraph) contains more than just a few sentences. 
+For each identified new topic, write a brief title or summary describing it. Additionally, assign a heading type based on the importance or hierarchy of the topic. Use the following rules for heading types: + + "heading1" for main sections or major topics. + "heading2" for subtopics or important subsections within a main topic. + "heading3" for smaller, more detailed sections within a subtopic. + +Input: The transcription will be provided as a series of lines. Each line is numbered for easy reference. Your task is to read through the transcription and identify the points where a new topic or argument begins. + +The previous output will be provided as input to guide you on where new sections begin and their existing structure. Use this to extend the structure as needed. +Output: The output should be formatted as a JSON object, where each key is the line number where a new topic starts, and the corresponding value is an object containing the title of that topic and the heading type. For example: +Example: +{ + "0": { + "Title": "Example title", + "Type": "heading1" + }, + "4": { + "Title": "Example subtitle", + "Type": "heading2" + }, + "9": { + "Title": "Another example subtitle", + "Type": "heading2" + }, + "14": { + "Title": "A third example subtitle", + "Type": "heading2" + } +} + +Instruction: Please read the transcription carefully and list the line numbers where each new topic or argument starts, along with a brief title summarizing that topic. 
Use the previous output to guide your extensions, adding titles and types ("heading1", "heading2", "heading3") as necessary.`, + } + + currentStart := 0 + blockSize := 200 + + currentStructure := map[string]ParagraphItem{} + + for currentStart < len(transcriptionSentences) { + if currentStart > len(transcriptionSentences) { + break + } + + blockEnd := currentStart + blockSize + + if blockEnd >= len(transcriptionSentences) { + blockEnd = len(transcriptionSentences) - 1 + } + + currentSlice := transcriptionSentences[currentStart:blockEnd] + + userPromptText := "" + for i, sentence := range currentSlice { + userPromptText += fmt.Sprintf("%d: %s\n", i+currentStart, sentence) + } + + currentJson, err := json.Marshal(currentStructure) + currentPrompt := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleUser, + Content: "Current structure is: " + string(currentJson), + } + + userPrompt := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleUser, + Content: "Current transcription slice: " + userPromptText, + } + + resp1, err := wt.client.CreateChatCompletion( + context.Background(), + openai.ChatCompletionRequest{ + Model: "mistral-small-instruct", + Messages: []openai.ChatCompletionMessage{systemPrompt, currentPrompt, userPrompt}, + ResponseFormat: &openai.ChatCompletionResponseFormat{ + Type: openai.ChatCompletionResponseFormatTypeJSONObject, + }, + }, + ) + + if err != nil { + fmt.Printf("Splitting error: %v\n", err) + return map[string]ParagraphItem{} + } + + var data map[string]ParagraphItem + json.Unmarshal([]byte(resp1.Choices[0].Message.Content), &data) + + for k, v := range data { + currentStructure[k] = v + } + + currentStart += blockSize + } + return currentStructure +} + +func (wt *StructureBuilder) Split(splitFile string, transcription string) ([]ParagraphItem, error) { + transcriptionSentences := strings.Split(transcription, ".") + + cache, err := wt.loadCache(splitFile) + + var paragraphs map[string]ParagraphItem + if wt.useCache && err == 
nil { + paragraphs = cache + } else { + paragraphs = wt.splitUsingLLM(transcriptionSentences) + wt.saveCache(splitFile, paragraphs) + } + + var startLines []int + for k, _ := range paragraphs { + q, err := strconv.Atoi(k) + if err != nil { + continue + } + startLines = append(startLines, q) + } + sort.Ints(startLines) + + var items []ParagraphItem + for i := range startLines { + q := len(transcriptionSentences) - 1 + + if i < len(startLines)-1 { + q = startLines[i+1] + } + + var currentParagraph string + for j := startLines[i]; j <= q; j++ { + currentParagraph += transcriptionSentences[j] + "." + } + newItem := paragraphs[strconv.Itoa(startLines[i])] + newItem.Content = currentParagraph + items = append(items, newItem) + } + return items, nil +} diff --git a/internal/llm/text_rewriter.go b/internal/llm/text_rewriter.go new file mode 100644 index 0000000..c456952 --- /dev/null +++ b/internal/llm/text_rewriter.go @@ -0,0 +1,99 @@ +/* +Copyright © 2024 Matteo Schiff + +*/ + +package llm + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os" + "time" + + "git.underdesk.net/Matte23/transcriber/internal/utils" + "github.com/sashabaranov/go-openai" +) + +func RewriteText(client *openai.Client, paragraphs []ParagraphItem, finalFile string) { + outputFile, err := os.Create(finalFile) + if err != nil { + panic(err) + } + // close output file on exit and check for its returned error + defer func() { + if err := outputFile.Close(); err != nil { + panic(err) + } + }() + + systemPrompt := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleSystem, + Content: `You are provided with: + + A transcription paragraph to rewrite, aimed at improving clarity, grammar, and flow while preserving the original meaning and details. + A JSON structure of the document that lists section and subsection titles, showing how this paragraph fits within the larger structure. + The current built document, containing all previously written sections up to this point. 
+ +Your task: Rewrite the transcription paragraph using clear and polished language, while keeping all key information intact. Format the paragraph using Markdown for improved readability (e.g., bold for emphasis, bullet points if applicable, etc.). Do not add any new information or leave out any critical details. Focus solely on rewriting the paragraph provided in the transcription, and do not add headers, titles, extra context, or explanations beyond the paragraph's Markdown-formatted text. + +Only write the Markdown-formatted paragraph text in your response.`, + } + + structureJson, _ := json.Marshal(paragraphs) + + documentStructure := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleUser, + Content: "Document structure is: " + string(structureJson), + } + + currentText := "" + + log.Println("Starting rewriting text") + for _, currentMessage := range paragraphs { + startTime := time.Now() + heading := "" + switch currentMessage.Type { + case "heading1": + heading += "# " + case "heading2": + heading += "## " + default: + heading += "### " + } + heading += currentMessage.Title + "\n" + currentText += heading + outputFile.Write([]byte(heading)) + + currentTranscription := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleUser, + Content: "Paragraph to rewrite is: '" + currentMessage.Content + "'", + } + + currentDocument := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleUser, + Content: "Current rewritten document is: \n\n" + currentText, + } + + resp1, err := client.CreateChatCompletion( + context.Background(), + openai.ChatCompletionRequest{ + Model: "mistral-small-instruct", + Messages: []openai.ChatCompletionMessage{systemPrompt, documentStructure, currentDocument, currentTranscription}, + }, + ) + + if err != nil { + fmt.Printf("LLM process error: %v\n", err) + return + } + result := resp1.Choices[0].Message.Content + + outputFile.Write([]byte(result + "\n\n")) + currentText += result + "\n\n" + + 
utils.MeasureTime(startTime, "Text rewrite iteration for "+currentMessage.Title) + } +} diff --git a/internal/llm/text_rewriter_large.go b/internal/llm/text_rewriter_large.go new file mode 100644 index 0000000..ccff283 --- /dev/null +++ b/internal/llm/text_rewriter_large.go @@ -0,0 +1,86 @@ +/* +Copyright © 2024 Matteo Schiff + +*/ + +package llm + +/*import ( + "context" + "fmt" + "log" + "os" + "time" + + "git.underdesk.net/Matte23/transcriber/utils" + "github.com/sashabaranov/go-openai" +) + +func RewriteTextLarge(client *openai.Client, transcription string) { + splittedTranscription = transcription + + outputFile, err := os.Create("output.md") + if err != nil { + panic(err) + } + // close output file on exit and check for its returned error + defer func() { + if err := outputFile.Close(); err != nil { + panic(err) + } + }() + + systemPrompt := openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleSystem, + Content: `You will receive a part of a transcription in English that you have to rewrite. Your task is to: + + Rewrite the transcription, ensuring that the content is clearly presented and properly structured. + Correct any English errors, including grammar, spelling, and punctuation. + Organize the discourse by adding appropriate headings, subheadings, and bullet points where needed. Use titles and subtitles to logically separate sections and make the content easy to follow. + Do not add any new information that is not present in the original transcription (for example don't insert a conclusion paragraph if it's not present in the original text). Do not change the meaning of the text. + Each response will be formatted in Markdown. Use the following guidelines: + Use #, ##, ### for titles and subtitles. + Use bullet points, numbered lists, and other Markdown formatting to present information clearly. + Use ** for bold. + Use paragraphs to separate distinct ideas or topics. 
+ + Each message you receive will be a part of a larger transcription, so please ensure that the content flows naturally and coherently. You should revise the transcription as if it were a section of a longer document, but avoid duplicating any content.`, + } + + var messages []openai.ChatCompletionMessage + + log.Println("Starting rewriting text") + for _, currentMessage := range splittedTranscription { + startTime := time.Now() + messages = append(messages, openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleUser, + Content: currentMessage, + }) + + if len(messages) > 4 { + messages = messages[2:] + } + + resp1, err := client.CreateChatCompletion( + context.Background(), + openai.ChatCompletionRequest{ + Model: "mistral-small-instruct", + Messages: append([]openai.ChatCompletionMessage{systemPrompt}, messages...), + }, + ) + + if err != nil { + fmt.Printf("LLM process error: %v\n", err) + return + } + result := resp1.Choices[0].Message.Content + + outputFile.Write([]byte(result + "\n")) + + messages = append(messages, openai.ChatCompletionMessage{ + Role: openai.ChatMessageRoleAssistant, + Content: result, + }) + utils.MeasureTime(startTime, "Text rewrite iteration") + } +} */ diff --git a/internal/stt/whisper.go b/internal/stt/whisper.go new file mode 100644 index 0000000..bd2738c --- /dev/null +++ b/internal/stt/whisper.go @@ -0,0 +1,68 @@ +/* +Copyright © 2024 Matteo Schiff + +*/ + +package stt + +import ( + "context" + "fmt" + "os" + + "github.com/sashabaranov/go-openai" +) + +type WhisperTranscriver struct { + client *openai.Client + useCache bool +} + +func NewWhisperTranscriver(client *openai.Client) *WhisperTranscriver { + return &WhisperTranscriver{ + client: client, + useCache: true, + } +} + +func (wt *WhisperTranscriver) cacheFileName(inputAudio string) string { + return inputAudio + ".transcribed.txt" +} + +func (wt *WhisperTranscriver) loadCache(inputAudio string) (string, error) { + content, err := os.ReadFile(inputAudio) + + return 
// MeasureTime logs, through the standard logger, how much time has passed
// since start, labelled with name ("<name> took <elapsed>").
func MeasureTime(start time.Time, name string) {
	log.Printf("%s took %s", name, time.Since(start))
}

// mergeElements groups slice into consecutive runs of n elements and joins
// each run with "." into a single string. The last run may hold fewer than n
// elements. A non-positive n returns slice unchanged.
func mergeElements(slice []string, n int) []string {
	if n <= 0 {
		return slice
	}

	var groups []string
	// Peel one run off the front of the remaining elements per iteration.
	for rest := slice; len(rest) > 0; {
		size := n
		if len(rest) < size {
			size = len(rest)
		}
		groups = append(groups, strings.Join(rest[:size], "."))
		rest = rest[size:]
	}
	return groups
}
+ Output(audioFile, ffmpeg.KwArgs{ + "ar": 16000, // Set audio sampling rate to 16 kHz + "ac": 1, // Set the number of audio channels to 1 (mono) + //"f": "segment", // Enable segmenting + //"segment_time": 600, // Split files every 600 seconds (10 minutes) + //"reset_timestamps": 1, // Reset timestamps in each segment + }). + OverWriteOutput(). // Overwrite if the output file already exists + Run() +} diff --git a/main.go b/main.go new file mode 100644 index 0000000..f3ef0fb --- /dev/null +++ b/main.go @@ -0,0 +1,11 @@ +/* +Copyright © 2024 Matteo Schiff + +*/ +package main + +import "git.underdesk.net/Matte23/transcriber/cmd" + +func main() { + cmd.Execute() +}