Add base code

2024-11-09 22:57:47 +01:00
parent 967f80d734
commit c7b40276cf
14 changed files with 975 additions and 0 deletions
--- a/internal/llm/paragraph_detector.go
+++ b/internal/llm/paragraph_detector.go
@@ -0,0 +1,234 @@
+/*
+Copyright © 2024 Matteo Schiff <matteo@underdesk.net>
+
+*/
+
+package llm
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/sashabaranov/go-openai"
+)
+
+// ParagraphDetector finds paragraph boundaries in a transcription by asking
+// an LLM, through an OpenAI-compatible client, where new topics begin.
+type ParagraphDetector struct {
+	// client is used for the chat completion requests.
+	client   *openai.Client
+	// useCache makes Split reuse start lines stored in the cache file when
+	// that file can be loaded.
+	useCache bool
+}
+
+// NewParagraphDetector returns a ParagraphDetector that sends its requests
+// through client and has caching enabled.
+func NewParagraphDetector(client *openai.Client) *ParagraphDetector {
+	detector := new(ParagraphDetector)
+	detector.client = client
+	detector.useCache = true
+	return detector
+}
+
+func (wt *ParagraphDetector) cacheFileName(inputAudio string) string {
+	return inputAudio + ".splitted.txt"
+}
+
+// loadCache reads the start lines previously stored for name.
+//
+// It returns an error when the cache file cannot be read or does not contain
+// a JSON array of integers; the returned slice is nil in that case.
+func (wt *ParagraphDetector) loadCache(name string) ([]int, error) {
+	path := wt.cacheFileName(name)
+
+	contentJson, err := os.ReadFile(path)
+	if err != nil {
+		// Report the read failure itself instead of letting a later decoding
+		// error hide it.
+		return nil, fmt.Errorf("reading paragraph cache %q: %w", path, err)
+	}
+
+	var content []int
+	if err := json.Unmarshal(contentJson, &content); err != nil {
+		return nil, fmt.Errorf("decoding paragraph cache %q: %w", path, err)
+	}
+
+	return content, nil
+}
+
+// saveCache stores content as a JSON array in the cache file derived from
+// name. It returns an error if encoding or writing fails.
+func (wt *ParagraphDetector) saveCache(name string, content []int) error {
+	contentJson, err := json.Marshal(content)
+	if err != nil {
+		return fmt.Errorf("encoding paragraph cache: %w", err)
+	}
+
+	path := wt.cacheFileName(name)
+	if err := os.WriteFile(path, contentJson, 0666); err != nil {
+		return fmt.Errorf("writing paragraph cache %q: %w", path, err)
+	}
+
+	return nil
+}
+
+// SplitResponse holds a map of string keys to string values under the JSON
+// member "Topics".
+type SplitResponse struct {
+	Topics map[string]string `json:"Topics"`
+}
+
+// MarshalJSON encodes s as a JSON object with a single "Topics" member.
+func (s SplitResponse) MarshalJSON() ([]byte, error) {
+	// plain has the same fields and tags as SplitResponse but no methods, so
+	// json.Marshal does not call back into MarshalJSON.
+	type plain SplitResponse
+	return json.Marshal(plain(s))
+}
+
+// splitUsingLLM asks the model where new topics start in
+// transcriptionSentences.
+//
+// The sentences are sent in blocks of blockSize lines, each block numbered
+// from 0; the line numbers returned by the model are translated back to
+// indexes into transcriptionSentences. Line numbers that do not belong to the
+// block that was sent are ignored. The result is sorted in ascending order
+// and contains no duplicates.
+//
+// If a request fails or returns no choices, the problem is printed and an
+// empty slice is returned.
+func (wt *ParagraphDetector) splitUsingLLM(transcriptionSentences []string) []int {
+	systemPrompt := openai.ChatCompletionMessage{
+		Role: openai.ChatMessageRoleSystem,
+		Content: `
+You are given a transcription of a conversation or speech. Your task is to identify where new topics or arguments begin within the text. A new topic is marked by a shift in subject, a new argument being introduced, or a clear change in focus. Additionally, ensure that each identified segment (paragraph) contains more than just a few sentences. If necessary, group smaller sections together so that each segment is substantial.
+
+Input: The transcription will be provided as a series of lines. Each line is numbered for easy reference. Your task is to read through the transcription and identify the points where a new topic or argument begins. If a segment is too short (fewer than three sentences), it should be grouped with an adjacent segment. For each identified new topic, write a brief title or summary describing it.
+
+Output: The output should be formatted as a JSON object, where each key is the line number where a new topic starts, and the corresponding value is the title of that topic. For example:
+
+json
+
+{
+  "0": "Title of First Topic",
+  "19": "Title of Second Topic"
+}
+
+Example:
+
+Transcription:
+
+vbnet
+
+0: Hello, everyone. Today we're going to talk about climate change.
+1: It's a complex issue, but I'll try to break it down.
+2: Climate change refers to long-term shifts in temperatures and weather patterns.
+3: In this discussion, we will cover the causes, effects, and possible solutions.
+4: First, let's discuss the causes of climate change.
+5: There are several factors, including greenhouse gas emissions.
+6: Most emissions come from burning fossil fuels like coal, oil, and gas.
+7: Another key cause is deforestation, which reduces the number of trees that can absorb CO2.
+8: Deforestation not only affects CO2 levels but also disrupts ecosystems.
+9: Next, let's move on to the effects of climate change.
+10: Rising temperatures are one of the most obvious effects.
+11: This leads to melting ice caps, rising sea levels, and extreme weather events.
+12: We are already seeing more frequent and intense heatwaves, hurricanes, and floods.
+13: The impact on wildlife is also severe, with many species facing habitat loss.
+14: Finally, what can we do to address this issue?
+15: One solution is to reduce our carbon footprint by using energy more efficiently.
+16: Renewable energy sources, like wind and solar, play a big role here.
+17: Governments and organizations worldwide are investing in clean energy technologies.
+18: Individual actions, such as reducing waste and conserving water, also make a difference.
+19: That's all for today's discussion on climate change. Thank you for listening.
+
+Expected Output:
+
+json
+
+{
+  "0": "Introduction to Climate Change",
+  "4": "Causes of Climate Change",
+  "9": "Effects of Climate Change",
+  "14": "Solutions to Climate Change"
+}
+
+Instruction: Please read the transcription carefully and list the line numbers where each new topic or argument starts, along with a brief title summarizing that topic. Ensure that shorter segments are grouped with neighboring content to form more substantial paragraphs before marking the start of a new topic.`,
+	}
+
+	const blockSize = 200
+
+	var startLines []int
+
+	for currentStart := 0; currentStart < len(transcriptionSentences); currentStart += blockSize {
+		// The slice end is exclusive, so clamp to len (not len-1); otherwise
+		// the last sentence would never be sent to the model.
+		blockEnd := currentStart + blockSize
+		if blockEnd > len(transcriptionSentences) {
+			blockEnd = len(transcriptionSentences)
+		}
+
+		currentSlice := transcriptionSentences[currentStart:blockEnd]
+
+		var userPromptText strings.Builder
+		for i, sentence := range currentSlice {
+			fmt.Fprintf(&userPromptText, "%d: %s\n", i, sentence)
+		}
+
+		userPrompt := openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleUser,
+			Content: userPromptText.String(),
+		}
+
+		resp1, err := wt.client.CreateChatCompletion(
+			context.Background(),
+			openai.ChatCompletionRequest{
+				Model:    "mistral-7b-instruct-v0.3",
+				Messages: []openai.ChatCompletionMessage{systemPrompt, userPrompt},
+				ResponseFormat: &openai.ChatCompletionResponseFormat{
+					Type: openai.ChatCompletionResponseFormatTypeJSONObject,
+					JSONSchema: &openai.ChatCompletionResponseFormatJSONSchema{
+						Name:   "splitter",
+						Strict: true,
+						Schema: SplitResponse{},
+					},
+				},
+			},
+		)
+
+		if err != nil {
+			fmt.Printf("Splitting error: %v\n", err)
+			return []int{}
+		}
+		if len(resp1.Choices) == 0 {
+			fmt.Println("Splitting error: response contains no choices")
+			return []int{}
+		}
+
+		content := resp1.Choices[0].Message.Content
+		// Print, not Printf: the model output is data and may contain '%'.
+		fmt.Print(content)
+
+		var data map[string]string
+		if err := json.Unmarshal([]byte(content), &data); err != nil {
+			// Keep going: entries decoded before the error are still usable.
+			fmt.Printf("Splitting error: invalid JSON in response: %v\n", err)
+		}
+
+		for k := range data {
+			// A key may look like "4-8"; only the part before "-" is used.
+			before, _, _ := strings.Cut(k, "-")
+			q, err := strconv.Atoi(before)
+			if err != nil {
+				continue
+			}
+			// The model numbers lines relative to the block it was given;
+			// anything outside the block cannot be a valid start line.
+			if q < 0 || q >= len(currentSlice) {
+				continue
+			}
+			startLines = append(startLines, currentStart+q)
+		}
+
+		fmt.Println(startLines)
+	}
+
+	sort.Ints(startLines)
+
+	// Drop duplicates (e.g. from keys "4" and "4-8") in place.
+	deduped := startLines[:0]
+	for _, line := range startLines {
+		if len(deduped) == 0 || deduped[len(deduped)-1] != line {
+			deduped = append(deduped, line)
+		}
+	}
+	return deduped
+}
+
+// Split divides transcription into paragraphs.
+//
+// The text is cut into sentences at every "." and the paragraph start lines
+// are taken from the cache file derived from name when caching is enabled and
+// the cache can be loaded; otherwise they are computed with the LLM and
+// written to the cache. Each paragraph spans from its start line up to, but
+// not including, the next start line; the last paragraph runs to the end of
+// the transcription. Every sentence is emitted followed by ".". Sentences
+// that precede the first start line are not part of any paragraph.
+//
+// The returned error is currently always nil.
+func (wt *ParagraphDetector) Split(name string, transcription string) ([]string, error) {
+	transcriptionSentences := strings.Split(transcription, ".")
+
+	var startLines []int
+	cached := false
+	if wt.useCache {
+		if cache, err := wt.loadCache(name); err == nil {
+			startLines = cache
+			cached = true
+		}
+	}
+	if !cached {
+		startLines = wt.splitUsingLLM(transcriptionSentences)
+		// Caching is best-effort: a failed write only costs a recomputation.
+		if err := wt.saveCache(name, startLines); err != nil {
+			fmt.Printf("Saving paragraph cache failed: %v\n", err)
+		}
+	}
+
+	var splittedTranscription []string
+	for i, start := range startLines {
+		// end is exclusive, so the first sentence of the next paragraph is
+		// not repeated at the end of this one.
+		end := len(transcriptionSentences)
+		if i < len(startLines)-1 {
+			end = startLines[i+1]
+		}
+
+		// Start lines come from the model or from a cache file, so clamp
+		// them instead of trusting them as slice indexes.
+		if start < 0 {
+			start = 0
+		}
+		if end > len(transcriptionSentences) {
+			end = len(transcriptionSentences)
+		}
+		if start >= end {
+			continue
+		}
+
+		var currentParagraph strings.Builder
+		for _, sentence := range transcriptionSentences[start:end] {
+			currentParagraph.WriteString(sentence)
+			currentParagraph.WriteString(".")
+		}
+		splittedTranscription = append(splittedTranscription, currentParagraph.String())
+	}
+
+	return splittedTranscription, nil
+}
--- a/internal/llm/structure_builder.go
+++ b/internal/llm/structure_builder.go
@@ -0,0 +1,199 @@
+/*
+Copyright © 2024 Matteo Schiff <matteo@underdesk.net>
+
+*/
+
+package llm
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/sashabaranov/go-openai"
+)
+
+// StructureBuilder asks an LLM, through an OpenAI-compatible client, for the
+// section structure (titles and heading types) of a transcription.
+type StructureBuilder struct {
+	// client is used for the chat completion requests.
+	client   *openai.Client
+	// useCache makes Split reuse the structure stored in the split file when
+	// that file can be loaded.
+	useCache bool
+}
+
+// NewStructureBuilder returns a StructureBuilder that sends its requests
+// through client and has caching enabled.
+func NewStructureBuilder(client *openai.Client) *StructureBuilder {
+	builder := new(StructureBuilder)
+	builder.client = client
+	builder.useCache = true
+	return builder
+}
+
+func (wt *StructureBuilder) cacheFileName(inputAudio string) string {
+	return inputAudio + ".splitted.txt"
+}
+
+// loadCache reads the document structure stored in the file name.
+//
+// It returns an error when the file cannot be read or does not contain a JSON
+// object of paragraph items; the returned map is nil in that case.
+func (wt *StructureBuilder) loadCache(name string) (map[string]ParagraphItem, error) {
+	contentJson, err := os.ReadFile(name)
+	if err != nil {
+		// Report the read failure itself instead of letting a later decoding
+		// error hide it.
+		return nil, fmt.Errorf("reading structure cache %q: %w", name, err)
+	}
+
+	var content map[string]ParagraphItem
+	if err := json.Unmarshal(contentJson, &content); err != nil {
+		return nil, fmt.Errorf("decoding structure cache %q: %w", name, err)
+	}
+
+	return content, nil
+}
+
+// saveCache stores content as a JSON object in the file name. It returns an
+// error if encoding or writing fails.
+func (wt *StructureBuilder) saveCache(name string, content map[string]ParagraphItem) error {
+	contentJson, err := json.Marshal(content)
+	if err != nil {
+		return fmt.Errorf("encoding structure cache: %w", err)
+	}
+
+	if err := os.WriteFile(name, contentJson, 0666); err != nil {
+		return fmt.Errorf("writing structure cache %q: %w", name, err)
+	}
+
+	return nil
+}
+
+// ParagraphItem describes one section of the structured document.
+type ParagraphItem struct {
+	// Title is the title of the section.
+	Title   string
+	// Type is the heading type of the section; the system prompt of the
+	// structure builder asks for "heading1", "heading2" or "heading3".
+	Type    string
+	// Content is the text of the section. It is excluded from JSON encoding
+	// and decoding.
+	Content string `json:"-"`
+}
+
+// splitUsingLLM asks the model for the section structure of
+// transcriptionSentences.
+//
+// The sentences are sent in blocks of blockSize lines, numbered with their
+// index in transcriptionSentences, together with the structure collected so
+// far. The result maps the decimal line number at which a section starts to
+// its title and heading type. Keys returned by the model that are not valid
+// line numbers of the transcription are ignored.
+//
+// If a request fails or returns no choices, the problem is printed and an
+// empty map is returned.
+func (wt *StructureBuilder) splitUsingLLM(transcriptionSentences []string) map[string]ParagraphItem {
+	systemPrompt := openai.ChatCompletionMessage{
+		Role: openai.ChatMessageRoleSystem,
+		Content: `
+You are given a transcription of a conversation or speech. Your task is to identify where new topics or arguments begin within the text. A new topic is marked by a shift in subject, a new argument being introduced, or a clear change in focus. Additionally, ensure that each identified segment (paragraph) contains more than just a few sentences.
+For each identified new topic, write a brief title or summary describing it. Additionally, assign a heading type based on the importance or hierarchy of the topic. Use the following rules for heading types:
+
+    "heading1" for main sections or major topics.
+    "heading2" for subtopics or important subsections within a main topic.
+    "heading3" for smaller, more detailed sections within a subtopic.
+
+Input: The transcription will be provided as a series of lines. Each line is numbered for easy reference. Your task is to read through the transcription and identify the points where a new topic or argument begins.
+
+The previous output will be provided as input to guide you on where new sections begin and their existing structure. Use this to extend the structure as needed.
+Output: The output should be formatted as a JSON object, where each key is the line number where a new topic starts, and the corresponding value is an object containing the title of that topic and the heading type. For example:
+Example:
+{
+  "0": {
+    "Title": "Example title",
+    "Type": "heading1"
+  },
+  "4": {
+    "Title": "Example subtitle",
+    "Type": "heading2"
+  },
+  "9": {
+    "Title": "Another example subtitle",
+    "Type": "heading2"
+  },
+  "14": {
+    "Title": "A third example subtitle",
+    "Type": "heading2"
+  }
+}
+
+Instruction: Please read the transcription carefully and list the line numbers where each new topic or argument starts, along with a brief title summarizing that topic. Use the previous output to guide your extensions, adding titles and types ("heading1", "heading2", "heading3") as necessary.`,
+	}
+
+	const blockSize = 200
+
+	currentStructure := map[string]ParagraphItem{}
+
+	for currentStart := 0; currentStart < len(transcriptionSentences); currentStart += blockSize {
+		// The slice end is exclusive, so clamp to len (not len-1); otherwise
+		// the last sentence would never be sent to the model.
+		blockEnd := currentStart + blockSize
+		if blockEnd > len(transcriptionSentences) {
+			blockEnd = len(transcriptionSentences)
+		}
+
+		currentSlice := transcriptionSentences[currentStart:blockEnd]
+
+		var userPromptText strings.Builder
+		for i, sentence := range currentSlice {
+			fmt.Fprintf(&userPromptText, "%d: %s\n", i+currentStart, sentence)
+		}
+
+		currentJson, err := json.Marshal(currentStructure)
+		if err != nil {
+			fmt.Printf("Splitting error: %v\n", err)
+			return map[string]ParagraphItem{}
+		}
+		currentPrompt := openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleUser,
+			Content: "Current structure is: " + string(currentJson),
+		}
+
+		userPrompt := openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleUser,
+			Content: "Current transcription slice: " + userPromptText.String(),
+		}
+
+		resp1, err := wt.client.CreateChatCompletion(
+			context.Background(),
+			openai.ChatCompletionRequest{
+				Model:    "mistral-small-instruct",
+				Messages: []openai.ChatCompletionMessage{systemPrompt, currentPrompt, userPrompt},
+				ResponseFormat: &openai.ChatCompletionResponseFormat{
+					Type: openai.ChatCompletionResponseFormatTypeJSONObject,
+				},
+			},
+		)
+
+		if err != nil {
+			fmt.Printf("Splitting error: %v\n", err)
+			return map[string]ParagraphItem{}
+		}
+		if len(resp1.Choices) == 0 {
+			fmt.Println("Splitting error: response contains no choices")
+			return map[string]ParagraphItem{}
+		}
+
+		var data map[string]ParagraphItem
+		if err := json.Unmarshal([]byte(resp1.Choices[0].Message.Content), &data); err != nil {
+			// Keep going: entries decoded before the error are still usable.
+			fmt.Printf("Splitting error: invalid JSON in response: %v\n", err)
+		}
+
+		for k, v := range data {
+			// Keys are later used as indexes into the sentences, so keep only
+			// valid line numbers and store them in canonical decimal form.
+			line, err := strconv.Atoi(k)
+			if err != nil || line < 0 || line >= len(transcriptionSentences) {
+				continue
+			}
+			currentStructure[strconv.Itoa(line)] = v
+		}
+	}
+	return currentStructure
+}
+
+// Split divides transcription into titled sections.
+//
+// The text is cut into sentences at every "." and the section structure is
+// taken from splitFile when caching is enabled and the file can be loaded;
+// otherwise it is computed with the LLM and written to splitFile. Sections
+// are returned in order of their start line. Each section spans from its
+// start line up to, but not including, the next start line; the last section
+// runs to the end of the transcription. Every sentence is emitted followed by
+// ".". Sentences that precede the first start line are not part of any
+// section.
+//
+// The returned error is currently always nil.
+func (wt *StructureBuilder) Split(splitFile string, transcription string) ([]ParagraphItem, error) {
+	transcriptionSentences := strings.Split(transcription, ".")
+
+	var paragraphs map[string]ParagraphItem
+	cached := false
+	if wt.useCache {
+		if cache, err := wt.loadCache(splitFile); err == nil {
+			paragraphs = cache
+			cached = true
+		}
+	}
+	if !cached {
+		paragraphs = wt.splitUsingLLM(transcriptionSentences)
+		// Caching is best-effort: a failed write only costs a recomputation.
+		if err := wt.saveCache(splitFile, paragraphs); err != nil {
+			fmt.Printf("Saving structure cache failed: %v\n", err)
+		}
+	}
+
+	var startLines []int
+	for k := range paragraphs {
+		q, err := strconv.Atoi(k)
+		if err != nil {
+			continue
+		}
+		// Keys come from the model or from a cache file; skip those that
+		// cannot index a sentence instead of panicking below.
+		if q < 0 || q >= len(transcriptionSentences) {
+			continue
+		}
+		startLines = append(startLines, q)
+	}
+	sort.Ints(startLines)
+
+	var items []ParagraphItem
+	for i, start := range startLines {
+		// end is exclusive, so the first sentence of the next section is not
+		// repeated at the end of this one.
+		end := len(transcriptionSentences)
+		if i < len(startLines)-1 {
+			end = startLines[i+1]
+		}
+		if start >= end {
+			continue
+		}
+
+		var currentParagraph strings.Builder
+		for _, sentence := range transcriptionSentences[start:end] {
+			currentParagraph.WriteString(sentence)
+			currentParagraph.WriteString(".")
+		}
+		newItem := paragraphs[strconv.Itoa(start)]
+		newItem.Content = currentParagraph.String()
+		items = append(items, newItem)
+	}
+	return items, nil
+}
--- a/internal/llm/text_rewriter.go
+++ b/internal/llm/text_rewriter.go
@@ -0,0 +1,99 @@
+/*
+Copyright © 2024 Matteo Schiff <matteo@underdesk.net>
+
+*/
+
+package llm
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"os"
+	"time"
+
+	"git.underdesk.net/Matte23/transcriber/internal/utils"
+	"github.com/sashabaranov/go-openai"
+)
+
+// RewriteText rewrites every paragraph with the LLM and writes the resulting
+// Markdown document to finalFile.
+//
+// For each item a Markdown heading is written ("# " for "heading1", "## " for
+// "heading2", "### " otherwise), followed by the text returned by the model.
+// Each request carries the document structure, the document rewritten so far
+// and the paragraph to rewrite.
+//
+// The function panics if finalFile cannot be created or closed. If encoding
+// the structure, a request, or a write fails, the problem is reported and the
+// function returns, leaving whatever was written so far in finalFile.
+func RewriteText(client *openai.Client, paragraphs []ParagraphItem, finalFile string) {
+	outputFile, err := os.Create(finalFile)
+	if err != nil {
+		panic(err)
+	}
+	// close output file on exit and check for its returned error
+	defer func() {
+		if err := outputFile.Close(); err != nil {
+			panic(err)
+		}
+	}()
+
+	systemPrompt := openai.ChatCompletionMessage{
+		Role: openai.ChatMessageRoleSystem,
+		Content: `You are provided with:
+
+    A transcription paragraph to rewrite, aimed at improving clarity, grammar, and flow while preserving the original meaning and details.
+    A JSON structure of the document that lists section and subsection titles, showing how this paragraph fits within the larger structure.
+    The current built document, containing all previously written sections up to this point.
+
+Your task: Rewrite the transcription paragraph using clear and polished language, while keeping all key information intact. Format the paragraph using Markdown for improved readability (e.g., bold for emphasis, bullet points if applicable, etc.). Do not add any new information or leave out any critical details. Focus solely on rewriting the paragraph provided in the transcription, and do not add headers, titles, extra context, or explanations beyond the paragraph's Markdown-formatted text.
+
+Only write the Markdown-formatted paragraph text in your response.`,
+	}
+
+	structureJson, err := json.Marshal(paragraphs)
+	if err != nil {
+		fmt.Printf("LLM process error: %v\n", err)
+		return
+	}
+
+	documentStructure := openai.ChatCompletionMessage{
+		Role:    openai.ChatMessageRoleUser,
+		Content: "Document structure is: " + string(structureJson),
+	}
+
+	// currentText accumulates the document written so far; it is sent with
+	// every request as context for the model.
+	currentText := ""
+
+	log.Println("Starting rewriting text")
+	for _, currentMessage := range paragraphs {
+		startTime := time.Now()
+		heading := ""
+		switch currentMessage.Type {
+		case "heading1":
+			heading += "# "
+		case "heading2":
+			heading += "## "
+		default:
+			heading += "### "
+		}
+		heading += currentMessage.Title + "\n"
+		currentText += heading
+		if _, err := outputFile.WriteString(heading); err != nil {
+			fmt.Printf("Writing output error: %v\n", err)
+			return
+		}
+
+		currentTranscription := openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleUser,
+			Content: "Paragraph to rewrite is: '" + currentMessage.Content + "'",
+		}
+
+		currentDocument := openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleUser,
+			Content: "Current rewritten document is: \n\n" + currentText,
+		}
+
+		resp1, err := client.CreateChatCompletion(
+			context.Background(),
+			openai.ChatCompletionRequest{
+				Model:    "mistral-small-instruct",
+				Messages: []openai.ChatCompletionMessage{systemPrompt, documentStructure, currentDocument, currentTranscription},
+			},
+		)
+
+		if err != nil {
+			fmt.Printf("LLM process error: %v\n", err)
+			return
+		}
+		// Indexing Choices[0] on an empty response would panic.
+		if len(resp1.Choices) == 0 {
+			fmt.Println("LLM process error: response contains no choices")
+			return
+		}
+		result := resp1.Choices[0].Message.Content
+
+		if _, err := outputFile.WriteString(result + "\n\n"); err != nil {
+			fmt.Printf("Writing output error: %v\n", err)
+			return
+		}
+		currentText += result + "\n\n"
+
+		utils.MeasureTime(startTime, "Text rewrite iteration for "+currentMessage.Title)
+	}
+}
--- a/internal/llm/text_rewriter_large.go
+++ b/internal/llm/text_rewriter_large.go
@@ -0,0 +1,86 @@
+/*
+Copyright © 2024 Matteo Schiff <matteo@underdesk.net>
+
+*/
+
+package llm
+
+/*import (
+	"context"
+	"fmt"
+	"log"
+	"os"
+	"time"
+
+	"git.underdesk.net/Matte23/transcriber/utils"
+	"github.com/sashabaranov/go-openai"
+)
+
+func RewriteTextLarge(client *openai.Client, transcription string) {
+	splittedTranscription = transcription
+
+	outputFile, err := os.Create("output.md")
+	if err != nil {
+		panic(err)
+	}
+	// close output file on exit and check for its returned error
+	defer func() {
+		if err := outputFile.Close(); err != nil {
+			panic(err)
+		}
+	}()
+
+	systemPrompt := openai.ChatCompletionMessage{
+		Role: openai.ChatMessageRoleSystem,
+		Content: `You will receive a part of a transcription in English that you have to rewrite. Your task is to:
+
+	    Rewrite the transcription, ensuring that the content is clearly presented and properly structured.
+	    Correct any English errors, including grammar, spelling, and punctuation.
+	    Organize the discourse by adding appropriate headings, subheadings, and bullet points where needed. Use titles and subtitles to logically separate sections and make the content easy to follow.
+	    Do not add any new information that is not present in the original transcription (for example don't insert a conclusion paragraph if it's not present in the original text). Do not change the meaning of the text.
+	    Each response will be formatted in Markdown. Use the following guidelines:
+	        Use #, ##, ### for titles and subtitles.
+	        Use bullet points, numbered lists, and other Markdown formatting to present information clearly.
+			Use ** for bold.
+	        Use paragraphs to separate distinct ideas or topics.
+
+		Each message you receive will be a part of a larger transcription, so please ensure that the content flows naturally and coherently. You should revise the transcription as if it were a section of a longer document, but avoid duplicating any content.`,
+	}
+
+	var messages []openai.ChatCompletionMessage
+
+	log.Println("Starting rewriting text")
+	for _, currentMessage := range splittedTranscription {
+		startTime := time.Now()
+		messages = append(messages, openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleUser,
+			Content: currentMessage,
+		})
+
+		if len(messages) > 4 {
+			messages = messages[2:]
+		}
+
+		resp1, err := client.CreateChatCompletion(
+			context.Background(),
+			openai.ChatCompletionRequest{
+				Model:    "mistral-small-instruct",
+				Messages: append([]openai.ChatCompletionMessage{systemPrompt}, messages...),
+			},
+		)
+
+		if err != nil {
+			fmt.Printf("LLM process error: %v\n", err)
+			return
+		}
+		result := resp1.Choices[0].Message.Content
+
+		outputFile.Write([]byte(result + "\n"))
+
+		messages = append(messages, openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleAssistant,
+			Content: result,
+		})
+		utils.MeasureTime(startTime, "Text rewrite iteration")
+	}
+} */