/*
Copyright © 2024 Matteo Schiff <matteo@underdesk.net>
*/

package llm

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"sort"
	"strconv"
	"strings"

	"github.com/sashabaranov/go-openai"
)

// ParagraphDetector splits a transcription into topic paragraphs by asking an
// LLM to identify the sentences where a new topic starts.
type ParagraphDetector struct {
	client   *openai.Client
	useCache bool
}

// NewParagraphDetector returns a detector that caches its results on disk.
func NewParagraphDetector(client *openai.Client) *ParagraphDetector {
	return &ParagraphDetector{
		client:   client,
		useCache: true,
	}
}

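// A minimal usage sketch, assuming an OpenAI-compatible endpoint; the base URL,
// file name and transcriptionText variable below are illustrative, not part of
// this package:
//
//	cfg := openai.DefaultConfig("")
//	cfg.BaseURL = "http://localhost:8080/v1"
//	detector := NewParagraphDetector(openai.NewClientWithConfig(cfg))
//	paragraphs, err := detector.Split("talk.wav", transcriptionText)
//	if err != nil {
//		// handle the error
//	}
//	for _, p := range paragraphs {
//		fmt.Println(p)
//	}
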
// cacheFileName derives the cache path from the input audio file name.
func (wt *ParagraphDetector) cacheFileName(inputAudio string) string {
	return inputAudio + ".splitted.txt"
}

// loadCache reads a previously computed list of paragraph start lines.
func (wt *ParagraphDetector) loadCache(name string) ([]int, error) {
	contentJson, err := os.ReadFile(wt.cacheFileName(name))
	if err != nil {
		return nil, err
	}

	var content []int
	err = json.Unmarshal(contentJson, &content)

	return content, err
}

// saveCache stores the paragraph start lines next to the input file.
func (wt *ParagraphDetector) saveCache(name string, content []int) error {
	contentJson, err := json.Marshal(content)
	if err != nil {
		return err
	}

	return os.WriteFile(wt.cacheFileName(name), contentJson, 0666)
}

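// For illustration, the cache is simply the JSON encoding of the start-line
// slice, so a file written for three paragraphs starting at sentences 0, 19
// and 42 would contain:
//
//	[0,19,42]
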
// SplitResponse is the shape passed as the structured-output schema for the
// chat request below: a map from start line number to topic title.
type SplitResponse struct {
	Topics map[string]string `json:"Topics"`
}

func (s SplitResponse) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		Topics map[string]string `json:"Topics"`
	}{
		Topics: s.Topics,
	})
}

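// For reference, the parsing in splitUsingLLM below reads the model reply as a
// flat map[string]string keyed by line number, matching the example in the
// system prompt (the titles here are illustrative):
//
//	{"0": "Introduction to Climate Change", "4": "Causes of Climate Change"}
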
// splitUsingLLM asks the model, one block of sentences at a time, for the line
// numbers where a new topic starts and returns them as indices into
// transcriptionSentences.
func (wt *ParagraphDetector) splitUsingLLM(transcriptionSentences []string) []int {
	systemPrompt := openai.ChatCompletionMessage{
		Role: openai.ChatMessageRoleSystem,
		Content: `
You are given a transcription of a conversation or speech. Your task is to identify where new topics or arguments begin within the text. A new topic is marked by a shift in subject, a new argument being introduced, or a clear change in focus. Additionally, ensure that each identified segment (paragraph) contains more than just a few sentences. If necessary, group smaller sections together so that each segment is substantial.

Input: The transcription will be provided as a series of lines. Each line is numbered for easy reference. Your task is to read through the transcription and identify the points where a new topic or argument begins. If a segment is too short (fewer than three sentences), it should be grouped with an adjacent segment. For each identified new topic, write a brief title or summary describing it.

Output: The output should be formatted as a JSON object, where each key is the line number where a new topic starts, and the corresponding value is the title of that topic. For example:

{
"0": "Title of First Topic",
"19": "Title of Second Topic"
}

Example:

Transcription:

0: Hello, everyone. Today we're going to talk about climate change.
1: It's a complex issue, but I'll try to break it down.
2: Climate change refers to long-term shifts in temperatures and weather patterns.
3: In this discussion, we will cover the causes, effects, and possible solutions.
4: First, let's discuss the causes of climate change.
5: There are several factors, including greenhouse gas emissions.
6: Most emissions come from burning fossil fuels like coal, oil, and gas.
7: Another key cause is deforestation, which reduces the number of trees that can absorb CO2.
8: Deforestation not only affects CO2 levels but also disrupts ecosystems.
9: Next, let's move on to the effects of climate change.
10: Rising temperatures are one of the most obvious effects.
11: This leads to melting ice caps, rising sea levels, and extreme weather events.
12: We are already seeing more frequent and intense heatwaves, hurricanes, and floods.
13: The impact on wildlife is also severe, with many species facing habitat loss.
14: Finally, what can we do to address this issue?
15: One solution is to reduce our carbon footprint by using energy more efficiently.
16: Renewable energy sources, like wind and solar, play a big role here.
17: Governments and organizations worldwide are investing in clean energy technologies.
18: Individual actions, such as reducing waste and conserving water, also make a difference.
19: That's all for today's discussion on climate change. Thank you for listening.

Expected Output:

{
"0": "Introduction to Climate Change",
"4": "Causes of Climate Change",
"9": "Effects of Climate Change",
"14": "Solutions to Climate Change"
}

Instruction: Please read the transcription carefully and list the line numbers where each new topic or argument starts, along with a brief title summarizing that topic. Ensure that shorter segments are grouped with neighboring content to form more substantial paragraphs before marking the start of a new topic.`,
	}

	currentStart := 0
	blockSize := 200

	var startLines []int

	// Process the transcription in blocks of blockSize sentences so each prompt
	// stays reasonably small.
	for currentStart < len(transcriptionSentences) {
		blockEnd := currentStart + blockSize
		if blockEnd > len(transcriptionSentences) {
			blockEnd = len(transcriptionSentences)
		}

		currentSlice := transcriptionSentences[currentStart:blockEnd]

		// Number each sentence so the model can reference it by line number.
		userPromptText := ""
		for i, sentence := range currentSlice {
			userPromptText += fmt.Sprintf("%d: %s\n", i, sentence)
		}

		userPrompt := openai.ChatCompletionMessage{
			Role:    openai.ChatMessageRoleUser,
			Content: userPromptText,
		}

		resp1, err := wt.client.CreateChatCompletion(
			context.Background(),
			openai.ChatCompletionRequest{
				Model:    "mistral-7b-instruct-v0.3",
				Messages: []openai.ChatCompletionMessage{systemPrompt, userPrompt},
				ResponseFormat: &openai.ChatCompletionResponseFormat{
					Type: openai.ChatCompletionResponseFormatTypeJSONObject,
					JSONSchema: &openai.ChatCompletionResponseFormatJSONSchema{
						Name:   "splitter",
						Strict: true,
						Schema: SplitResponse{},
					},
				},
			},
		)
		if err != nil {
			fmt.Printf("Splitting error: %v\n", err)
			return []int{}
		}

		// Log the raw model reply for debugging.
		fmt.Println(resp1.Choices[0].Message.Content)

		var data map[string]string
		if err := json.Unmarshal([]byte(resp1.Choices[0].Message.Content), &data); err != nil {
			fmt.Printf("Could not parse splitter response: %v\n", err)
		}

		// Keys are line numbers relative to the current block; if the model answers
		// with a range such as "12-15", keep only the first number, then translate
		// it to a global sentence index.
		for k := range data {
			before, _, _ := strings.Cut(k, "-")
			q, err := strconv.Atoi(before)
			if err != nil {
				continue
			}
			startLines = append(startLines, currentStart+q)
		}

		fmt.Println(startLines)

		currentStart += blockSize
	}

	sort.Ints(startLines)
	return startLines
}

// Split breaks a transcription into topic paragraphs. The sentence indices where
// paragraphs start are cached next to the input file so the LLM is queried only once.
func (wt *ParagraphDetector) Split(name string, transcription string) ([]string, error) {
	transcriptionSentences := strings.Split(transcription, ".")

	cache, err := wt.loadCache(name)

	var startLines []int
	if wt.useCache && err == nil {
		startLines = cache
	} else {
		startLines = wt.splitUsingLLM(transcriptionSentences)
		if err := wt.saveCache(name, startLines); err != nil {
			fmt.Printf("Could not write split cache: %v\n", err)
		}
	}

	var splittedTranscription []string
	for i := range startLines {
		// Each paragraph ends where the next one starts, or at the last sentence.
		q := len(transcriptionSentences)
		if i < len(startLines)-1 {
			q = startLines[i+1]
		}

		var currentParagraph string
		for j := startLines[i]; j < q; j++ {
			currentParagraph += transcriptionSentences[j] + "."
		}
		splittedTranscription = append(splittedTranscription, currentParagraph)
	}

	return splittedTranscription, nil
}
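
// A worked illustration of the paragraph assembly in Split, with hypothetical
// values: for the transcription "A. B. C. D" the sentences are
// ["A", " B", " C", " D"]; with start lines [0, 2] the returned paragraphs are
// ["A. B.", " C. D."].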