Add base code

2024-11-09 22:57:47 +01:00
parent 967f80d734
commit c7b40276cf
14 changed files with 975 additions and 0 deletions
--- a/internal/llm/paragraph_detector.go
+++ b/internal/llm/paragraph_detector.go
@@ -0,0 +1,234 @@
+/*
+Copyright © 2024 Matteo Schiff <matteo@underdesk.net>
+
+*/
+
+package llm
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/sashabaranov/go-openai"
+)
+
+// ParagraphDetector groups the sentences of a transcription into paragraphs
+// by asking a chat model where new topics begin, optionally reusing start
+// lines stored on disk by an earlier run.
+type ParagraphDetector struct {
+	// client is the go-openai client used to send chat completion requests.
+	client   *openai.Client
+	// useCache, when true, lets Split use previously saved start lines
+	// instead of querying the model.
+	useCache bool
+}
+
+// NewParagraphDetector builds a ParagraphDetector that sends its requests
+// through client. Caching of the detected start lines is switched on.
+func NewParagraphDetector(client *openai.Client) *ParagraphDetector {
+	detector := new(ParagraphDetector)
+	detector.client = client
+	detector.useCache = true
+	return detector
+}
+
+func (wt *ParagraphDetector) cacheFileName(inputAudio string) string {
+	return inputAudio + ".splitted.txt"
+}
+
+// loadCache reads the start lines previously stored for name.
+//
+// It returns the decoded JSON integer array, or a nil slice and a wrapped
+// error when the cache file cannot be read or does not contain a JSON array
+// of integers. Callers treat any error as a cache miss.
+func (wt *ParagraphDetector) loadCache(name string) ([]int, error) {
+	path := wt.cacheFileName(name)
+
+	contentJson, err := os.ReadFile(path)
+	if err != nil {
+		// Report the real I/O failure instead of letting it surface later as
+		// a confusing "unexpected end of JSON input".
+		return nil, fmt.Errorf("reading paragraph cache %q: %w", path, err)
+	}
+
+	var content []int
+	if err := json.Unmarshal(contentJson, &content); err != nil {
+		return nil, fmt.Errorf("decoding paragraph cache %q: %w", path, err)
+	}
+
+	return content, nil
+}
+
+// saveCache stores content as a JSON integer array in the cache file that
+// belongs to name, replacing any previous file.
+//
+// It returns a wrapped error when the content cannot be encoded or the file
+// cannot be written.
+func (wt *ParagraphDetector) saveCache(name string, content []int) error {
+	path := wt.cacheFileName(name)
+
+	contentJson, err := json.Marshal(content)
+	if err != nil {
+		return fmt.Errorf("encoding paragraph cache %q: %w", path, err)
+	}
+
+	if err := os.WriteFile(path, contentJson, 0666); err != nil {
+		return fmt.Errorf("writing paragraph cache %q: %w", path, err)
+	}
+
+	return nil
+}
+
+// SplitResponse is the value handed to the chat completion request as the
+// JSON schema of the expected reply. It holds a single "Topics" object that
+// maps strings to strings.
+type SplitResponse struct {
+	Topics map[string]string `json:"Topics"`
+}
+
+// MarshalJSON encodes s as a JSON object with one "Topics" member.
+func (s SplitResponse) MarshalJSON() ([]byte, error) {
+	// Encode through a method-less twin type so json.Marshal does not call
+	// this method again.
+	type plainSplitResponse SplitResponse
+	return json.Marshal(plainSplitResponse(s))
+}
+
+// splitUsingLLM asks the chat model where new topics begin in
+// transcriptionSentences and returns the sorted, absolute indices of the
+// sentences that open a topic.
+//
+// The sentences are sent in consecutive blocks of at most blockSize lines.
+// Inside each block the lines are numbered from 0, and the numbers found in
+// the keys of the model's JSON reply are shifted by the block offset. Keys
+// that are not integers (after dropping an optional "-suffix") or that point
+// outside the block are ignored. A reply that is not a JSON object is
+// reported and that block contributes no start lines.
+//
+// If a request fails or a reply carries no choices, the problem is printed
+// and an empty, non-nil slice is returned.
+func (wt *ParagraphDetector) splitUsingLLM(transcriptionSentences []string) []int {
+	systemPrompt := openai.ChatCompletionMessage{
+		Role: openai.ChatMessageRoleSystem,
+		Content: `
+You are given a transcription of a conversation or speech. Your task is to identify where new topics or arguments begin within the text. A new topic is marked by a shift in subject, a new argument being introduced, or a clear change in focus. Additionally, ensure that each identified segment (paragraph) contains more than just a few sentences. If necessary, group smaller sections together so that each segment is substantial.
+
+Input: The transcription will be provided as a series of lines. Each line is numbered for easy reference. Your task is to read through the transcription and identify the points where a new topic or argument begins. If a segment is too short (fewer than three sentences), it should be grouped with an adjacent segment. For each identified new topic, write a brief title or summary describing it.
+
+Output: The output should be formatted as a JSON object, where each key is the line number where a new topic starts, and the corresponding value is the title of that topic. For example:
+
+json
+
+{
+  "0": "Title of First Topic",
+  "19": "Title of Second Topic"
+}
+
+Example:
+
+Transcription:
+
+vbnet
+
+0: Hello, everyone. Today we're going to talk about climate change.
+1: It's a complex issue, but I'll try to break it down.
+2: Climate change refers to long-term shifts in temperatures and weather patterns.
+3: In this discussion, we will cover the causes, effects, and possible solutions.
+4: First, let's discuss the causes of climate change.
+5: There are several factors, including greenhouse gas emissions.
+6: Most emissions come from burning fossil fuels like coal, oil, and gas.
+7: Another key cause is deforestation, which reduces the number of trees that can absorb CO2.
+8: Deforestation not only affects CO2 levels but also disrupts ecosystems.
+9: Next, let's move on to the effects of climate change.
+10: Rising temperatures are one of the most obvious effects.
+11: This leads to melting ice caps, rising sea levels, and extreme weather events.
+12: We are already seeing more frequent and intense heatwaves, hurricanes, and floods.
+13: The impact on wildlife is also severe, with many species facing habitat loss.
+14: Finally, what can we do to address this issue?
+15: One solution is to reduce our carbon footprint by using energy more efficiently.
+16: Renewable energy sources, like wind and solar, play a big role here.
+17: Governments and organizations worldwide are investing in clean energy technologies.
+18: Individual actions, such as reducing waste and conserving water, also make a difference.
+19: That's all for today's discussion on climate change. Thank you for listening.
+
+Expected Output:
+
+json
+
+{
+  "0": "Introduction to Climate Change",
+  "4": "Causes of Climate Change",
+  "9": "Effects of Climate Change",
+  "14": "Solutions to Climate Change"
+}
+
+Instruction: Please read the transcription carefully and list the line numbers where each new topic or argument starts, along with a brief title summarizing that topic. Ensure that shorter segments are grouped with neighboring content to form more substantial paragraphs before marking the start of a new topic.`,
+	}
+
+	// Maximum number of sentences sent to the model in one request.
+	const blockSize = 200
+
+	var startLines []int
+
+	for currentStart := 0; currentStart < len(transcriptionSentences); currentStart += blockSize {
+		// blockEnd is an exclusive bound, so it is clamped to the length
+		// itself; clamping to length-1 would silently drop the last sentence.
+		blockEnd := currentStart + blockSize
+		if blockEnd > len(transcriptionSentences) {
+			blockEnd = len(transcriptionSentences)
+		}
+
+		currentSlice := transcriptionSentences[currentStart:blockEnd]
+
+		// Number the lines relative to the block, as the prompt describes.
+		var userPromptText strings.Builder
+		for i, sentence := range currentSlice {
+			fmt.Fprintf(&userPromptText, "%d: %s\n", i, sentence)
+		}
+
+		userPrompt := openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleUser,
+			Content: userPromptText.String(),
+		}
+
+		// NOTE(review): the response format type is "json_object" while a
+		// JSON schema is supplied as well - confirm the backend honours this
+		// combination.
+		resp1, err := wt.client.CreateChatCompletion(
+			context.Background(),
+			openai.ChatCompletionRequest{
+				Model:    "mistral-7b-instruct-v0.3",
+				Messages: []openai.ChatCompletionMessage{systemPrompt, userPrompt},
+				ResponseFormat: &openai.ChatCompletionResponseFormat{
+					Type: openai.ChatCompletionResponseFormatTypeJSONObject,
+					JSONSchema: &openai.ChatCompletionResponseFormatJSONSchema{
+						Name:   "splitter",
+						Strict: true,
+						Schema: SplitResponse{},
+					},
+				},
+			},
+		)
+
+		if err != nil {
+			fmt.Printf("Splitting error: %v\n", err)
+			return []int{}
+		}
+		if len(resp1.Choices) == 0 {
+			fmt.Println("Splitting error: no choices in response")
+			return []int{}
+		}
+
+		content := resp1.Choices[0].Message.Content
+		// Print, not Printf: the model output must not be used as a format
+		// string.
+		fmt.Print(content)
+
+		// Only the keys matter, so accept values of any JSON type.
+		var data map[string]json.RawMessage
+		if err := json.Unmarshal([]byte(content), &data); err != nil {
+			fmt.Printf("Splitting error: %v\n", err)
+			continue
+		}
+
+		for k := range data {
+			// Tolerate keys written as a range such as "4-8": keep the start.
+			before, _, _ := strings.Cut(k, "-")
+			q, err := strconv.Atoi(strings.TrimSpace(before))
+			if err != nil {
+				continue
+			}
+			// Discard line numbers the model invented outside this block.
+			if q < 0 || q >= len(currentSlice) {
+				continue
+			}
+			startLines = append(startLines, currentStart+q)
+		}
+
+		fmt.Println(startLines)
+	}
+
+	sort.Ints(startLines)
+	return startLines
+}
+
+// Split cuts transcription into paragraphs, one per detected topic.
+//
+// The text is split into sentences at every "."; the indices of the sentences
+// that open a paragraph come from the cache stored for name (when caching is
+// enabled and a non-empty cache can be loaded) or from the chat model
+// otherwise. A non-empty model result is written back to the cache; a failure
+// to write it is printed and does not abort the split.
+//
+// Every sentence ends up in exactly one paragraph, and concatenating the
+// returned paragraphs gives back the original transcription, apart from
+// paragraphs that contain only whitespace, which are omitted. When no start
+// lines are available the whole transcription is returned as one paragraph.
+// The returned error is always nil.
+func (wt *ParagraphDetector) Split(name string, transcription string) ([]string, error) {
+	transcriptionSentences := strings.Split(transcription, ".")
+
+	var startLines []int
+	if wt.useCache {
+		// An empty cache is treated as a miss so that the result of an
+		// earlier failed run does not stick forever.
+		if cache, err := wt.loadCache(name); err == nil && len(cache) > 0 {
+			startLines = cache
+		}
+	}
+	if len(startLines) == 0 {
+		startLines = wt.splitUsingLLM(transcriptionSentences)
+		if len(startLines) > 0 {
+			// Caching is best effort: report the problem and carry on.
+			if err := wt.saveCache(name, startLines); err != nil {
+				fmt.Printf("Saving split cache error: %v\n", err)
+			}
+		}
+	}
+
+	starts := normalizeParagraphStarts(startLines, len(transcriptionSentences))
+
+	var splittedTranscription []string
+	for i, start := range starts {
+		isLast := i == len(starts)-1
+
+		// A paragraph stops right before the next one starts, so the
+		// boundary sentence is not repeated in both.
+		end := len(transcriptionSentences)
+		if !isLast {
+			end = starts[i+1]
+		}
+
+		currentParagraph := strings.Join(transcriptionSentences[start:end], ".")
+		if !isLast {
+			// Restore the separator consumed between this paragraph and the
+			// next; the last paragraph has no separator after it.
+			currentParagraph += "."
+		}
+
+		if strings.TrimSpace(currentParagraph) == "" {
+			continue
+		}
+		splittedTranscription = append(splittedTranscription, currentParagraph)
+	}
+
+	return splittedTranscription, nil
+}
+
+// normalizeParagraphStarts turns startLines into a sorted list of distinct
+// paragraph start indices that always begins with 0 and only contains values
+// below sentenceCount. It returns nil when sentenceCount is not positive.
+func normalizeParagraphStarts(startLines []int, sentenceCount int) []int {
+	if sentenceCount <= 0 {
+		return nil
+	}
+
+	// Work on a copy so the caller's slice is not reordered.
+	valid := make([]int, 0, len(startLines))
+	for _, line := range startLines {
+		if line > 0 && line < sentenceCount {
+			valid = append(valid, line)
+		}
+	}
+	sort.Ints(valid)
+
+	// Starting at 0 guarantees that leading sentences are never dropped.
+	starts := make([]int, 1, len(valid)+1)
+	for _, line := range valid {
+		if line != starts[len(starts)-1] {
+			starts = append(starts, line)
+		}
+	}
+	return starts
+}