// scriba/internal/llm/structure_builder.go

/*
Copyright © 2024 Matteo Schiff <matteo@underdesk.net>

*/

package llm

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"sort"
	"strconv"
	"strings"

	"github.com/sashabaranov/go-openai"
)

// StructureBuilder splits a transcription into titled paragraphs by asking a
// chat model (reached through the go-openai client) where new topics begin.
type StructureBuilder struct {
	// client is the chat-completion client used to query the model.
	client   *openai.Client
	// useCache lets Split reuse a previously saved structure when one can be loaded.
	useCache bool
}

// NewStructureBuilder returns a StructureBuilder that queries the model
// through client. The returned builder has caching enabled.
func NewStructureBuilder(client *openai.Client) *StructureBuilder {
	builder := new(StructureBuilder)
	builder.client = client
	builder.useCache = true
	return builder
}

// cacheFileName derives the path of the structure cache file for inputAudio
// by appending a fixed suffix to it.
func (wt *StructureBuilder) cacheFileName(inputAudio string) string {
	const cacheSuffix = ".splitted.txt"
	return inputAudio + cacheSuffix
}

// loadCache reads the JSON structure previously written by saveCache from the
// file called name.
//
// It returns the decoded map keyed by start-line number. A non-nil error is
// returned when the file cannot be read or does not contain valid JSON for
// that map; in that case the returned map is nil.
func (wt *StructureBuilder) loadCache(name string) (map[string]ParagraphItem, error) {
	contentJson, err := os.ReadFile(name)
	if err != nil {
		// Report the real I/O failure instead of letting the decoder turn
		// it into a misleading "unexpected end of JSON input".
		return nil, fmt.Errorf("reading structure cache %q: %w", name, err)
	}

	var content map[string]ParagraphItem
	if err := json.Unmarshal(contentJson, &content); err != nil {
		return nil, fmt.Errorf("decoding structure cache %q: %w", name, err)
	}

	return content, nil
}

// saveCache serializes content as JSON and writes it to the file called name
// (created with mode 0666 before umask, or truncated if it exists).
//
// It returns an error if the structure cannot be encoded or the file cannot
// be written. Nothing is written when encoding fails.
func (wt *StructureBuilder) saveCache(name string, content map[string]ParagraphItem) error {
	contentJson, err := json.Marshal(content)
	if err != nil {
		return fmt.Errorf("encoding structure cache: %w", err)
	}

	if err := os.WriteFile(name, contentJson, 0666); err != nil {
		return fmt.Errorf("writing structure cache %q: %w", name, err)
	}

	return nil
}

// ParagraphItem describes one section of the structured transcription.
type ParagraphItem struct {
	// Title is the short title or summary assigned to the section.
	Title   string
	// Type is the heading level; the prompt asks the model for "heading1",
	// "heading2" or "heading3".
	Type    string
	// Content is the section text filled in by Split. The `json:"-"` tag keeps
	// it out of the JSON form, so it is neither cached nor decoded from JSON.
	Content string `json:"-"`
}

// splitUsingLLM asks the chat model to locate topic boundaries in
// transcriptionSentences.
//
// The sentences are sent in consecutive blocks of blockSize lines, each line
// prefixed with its index in transcriptionSentences. Every request also
// carries the structure accumulated so far (as JSON) so the model can extend
// it. Entries decoded from each response are merged into the accumulated
// structure; an entry from a later block replaces an earlier one with the
// same key.
//
// The returned map is keyed by the line number reported by the model (the
// prompt asks for decimal strings) and holds the title and heading type of
// the section starting there. If a request fails, or a response carries no
// choices, the problem is printed and an empty map is returned. A response
// that cannot be fully decoded is reported and whatever was decoded is still
// merged, so one bad block does not discard the rest.
func (wt *StructureBuilder) splitUsingLLM(transcriptionSentences []string) map[string]ParagraphItem {
	systemPrompt := openai.ChatCompletionMessage{
		Role: openai.ChatMessageRoleSystem,
		Content: `
You are given a transcription of a conversation or speech. Your task is to identify where new topics or arguments begin within the text. A new topic is marked by a shift in subject, a new argument being introduced, or a clear change in focus. Additionally, ensure that each identified segment (paragraph) contains more than just a few sentences.
For each identified new topic, write a brief title or summary describing it. Additionally, assign a heading type based on the importance or hierarchy of the topic. Use the following rules for heading types:

    "heading1" for main sections or major topics.
    "heading2" for subtopics or important subsections within a main topic.
    "heading3" for smaller, more detailed sections within a subtopic.

Input: The transcription will be provided as a series of lines. Each line is numbered for easy reference. Your task is to read through the transcription and identify the points where a new topic or argument begins.

The previous output will be provided as input to guide you on where new sections begin and their existing structure. Use this to extend the structure as needed.
Output: The output should be formatted as a JSON object, where each key is the line number where a new topic starts, and the corresponding value is an object containing the title of that topic and the heading type. For example:
Example:
{
  "0": {
    "Title": "Example title",
    "Type": "heading1"
  },
  "4": {
    "Title": "Example subtitle",
    "Type": "heading2"
  },
  "9": {
    "Title": "Another example subtitle",
    "Type": "heading2"
  },
  "14": {
    "Title": "A third example subtitle",
    "Type": "heading2"
  }
}

Instruction: Please read the transcription carefully and list the line numbers where each new topic or argument starts, along with a brief title summarizing that topic. Use the previous output to guide your extensions, adding titles and types ("heading1", "heading2", "heading3") as necessary.`,
	}

	// Number of sentences sent to the model per request.
	const blockSize = 200

	currentStructure := map[string]ParagraphItem{}

	for currentStart := 0; currentStart < len(transcriptionSentences); currentStart += blockSize {
		// The upper bound of a slice expression is exclusive, so clamp to
		// len (not len-1); otherwise the final sentence is never sent.
		blockEnd := currentStart + blockSize
		if blockEnd > len(transcriptionSentences) {
			blockEnd = len(transcriptionSentences)
		}

		var userPromptText strings.Builder
		for i, sentence := range transcriptionSentences[currentStart:blockEnd] {
			fmt.Fprintf(&userPromptText, "%d: %s\n", i+currentStart, sentence)
		}

		currentJson, err := json.Marshal(currentStructure)
		if err != nil {
			fmt.Printf("Splitting error: %v\n", err)
			return map[string]ParagraphItem{}
		}

		currentPrompt := openai.ChatCompletionMessage{
			Role:    openai.ChatMessageRoleUser,
			Content: "Current structure is: " + string(currentJson),
		}

		userPrompt := openai.ChatCompletionMessage{
			Role:    openai.ChatMessageRoleUser,
			Content: "Current transcription slice: " + userPromptText.String(),
		}

		resp1, err := wt.client.CreateChatCompletion(
			context.Background(),
			openai.ChatCompletionRequest{
				Model:    "mistral-small-instruct",
				Messages: []openai.ChatCompletionMessage{systemPrompt, currentPrompt, userPrompt},
				ResponseFormat: &openai.ChatCompletionResponseFormat{
					Type: openai.ChatCompletionResponseFormatTypeJSONObject,
				},
			},
		)

		if err != nil {
			fmt.Printf("Splitting error: %v\n", err)
			return map[string]ParagraphItem{}
		}

		// Guard the index below: a response without choices would panic.
		if len(resp1.Choices) == 0 {
			fmt.Printf("Splitting error: no choices returned for lines %d-%d\n", currentStart, blockEnd-1)
			return map[string]ParagraphItem{}
		}

		var data map[string]ParagraphItem
		if err := json.Unmarshal([]byte(resp1.Choices[0].Message.Content), &data); err != nil {
			// Best effort: report the problem and keep whatever was decoded.
			fmt.Printf("Splitting error: cannot decode model response for lines %d-%d: %v\n", currentStart, blockEnd-1, err)
		}

		for k, v := range data {
			currentStructure[k] = v
		}
	}
	return currentStructure
}

// Split divides transcription into titled paragraphs.
//
// The transcription is cut into sentences at every "." and the section
// structure (start line -> title and heading type) is obtained either from
// the cache file splitFile, when caching is enabled and the file loads
// cleanly, or from splitUsingLLM. A freshly computed, non-empty structure is
// written back to splitFile; a failure to write it is printed and otherwise
// ignored. An empty structure is not cached, because splitUsingLLM returns an
// empty map on failure and caching that would prevent any later retry.
//
// Start lines that are not integers or fall outside the sentence range are
// skipped. Each returned item covers the sentences from its start line up to
// (but not including) the next start line; the last item runs to the end of
// the transcription. Sentences before the first start line are not part of
// any item. Items are returned in ascending start-line order, and the
// returned error is always nil.
func (wt *StructureBuilder) Split(splitFile string, transcription string) ([]ParagraphItem, error) {
	transcriptionSentences := strings.Split(transcription, ".")

	var paragraphs map[string]ParagraphItem
	fromCache := false
	if wt.useCache {
		if cache, err := wt.loadCache(splitFile); err == nil {
			paragraphs = cache
			fromCache = true
		}
	}
	if !fromCache {
		paragraphs = wt.splitUsingLLM(transcriptionSentences)
		if len(paragraphs) > 0 {
			// Caching is best effort: report a failure but still return the result.
			if err := wt.saveCache(splitFile, paragraphs); err != nil {
				fmt.Printf("Cache save error: %v\n", err)
			}
		}
	}

	// Index sections by numeric start line, dropping keys that would index
	// outside transcriptionSentences.
	byLine := make(map[int]ParagraphItem, len(paragraphs))
	var startLines []int
	for k, item := range paragraphs {
		line, err := strconv.Atoi(k)
		if err != nil || line < 0 || line >= len(transcriptionSentences) {
			continue
		}
		if _, seen := byLine[line]; seen {
			// Several keys can denote the same line (e.g. "5" and "05");
			// let the canonical spelling win and list the line only once.
			if k == strconv.Itoa(line) {
				byLine[line] = item
			}
			continue
		}
		byLine[line] = item
		startLines = append(startLines, line)
	}
	sort.Ints(startLines)

	var items []ParagraphItem
	for i, start := range startLines {
		end := len(transcriptionSentences)
		isLast := i == len(startLines)-1
		if !isLast {
			// Exclusive bound: the next section's first sentence belongs to
			// that section only.
			end = startLines[i+1]
		}

		// Joining with "." restores the separators removed by strings.Split.
		content := strings.Join(transcriptionSentences[start:end], ".")
		if !isLast {
			// The "." that separated this section from the next one ends
			// this section's final sentence.
			content += "."
		}

		newItem := byLine[start]
		newItem.Content = content
		items = append(items, newItem)
	}
	return items, nil
}