Add base code
This commit is contained in:
234
internal/llm/paragraph_detector.go
Normal file
234
internal/llm/paragraph_detector.go
Normal file
@@ -0,0 +1,234 @@
|
||||
/*
|
||||
Copyright © 2024 Matteo Schiff <matteo@underdesk.net>
|
||||
|
||||
*/
|
||||
|
||||
package llm
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/sashabaranov/go-openai"
|
||||
)
|
||||
|
||||
type ParagraphDetector struct {
|
||||
client *openai.Client
|
||||
useCache bool
|
||||
}
|
||||
|
||||
func NewParagraphDetector(client *openai.Client) *ParagraphDetector {
|
||||
return &ParagraphDetector{
|
||||
client: client,
|
||||
useCache: true,
|
||||
}
|
||||
}
|
||||
|
||||
func (wt *ParagraphDetector) cacheFileName(inputAudio string) string {
|
||||
return inputAudio + ".splitted.txt"
|
||||
}
|
||||
|
||||
func (wt *ParagraphDetector) loadCache(name string) ([]int, error) {
|
||||
contentJson, err := os.ReadFile(wt.cacheFileName(name))
|
||||
|
||||
var content []int
|
||||
err = json.Unmarshal(contentJson, &content)
|
||||
|
||||
return content, err
|
||||
}
|
||||
|
||||
func (wt *ParagraphDetector) saveCache(name string, content []int) error {
|
||||
contentJson, _ := json.Marshal(content)
|
||||
|
||||
err := os.WriteFile(wt.cacheFileName(name), contentJson, 0666)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// SplitResponse is the value passed as the schema of the chat completion
// response format. Topics maps string keys to string values.
type SplitResponse struct {
	Topics map[string]string `json:"Topics"`
}

// MarshalJSON encodes s as a JSON object with a single "Topics" member.
func (s SplitResponse) MarshalJSON() ([]byte, error) {
	// A locally defined type shares the fields and tags of SplitResponse but
	// not its methods, so encoding it does not call MarshalJSON again.
	type plainSplitResponse SplitResponse

	return json.Marshal(plainSplitResponse(s))
}
|
||||
|
||||
func (wt *ParagraphDetector) splitUsingLLM(transcriptionSentences []string) []int {
|
||||
systemPrompt := openai.ChatCompletionMessage{
|
||||
Role: openai.ChatMessageRoleSystem,
|
||||
//Content: "Identify topics in the following transcription. Topics are a group of sentences about the same argument. Format the output using a JSON integer array, where each item is the line number (integer). Each time you see one or two new topics, add the integer to the array.",
|
||||
Content: `
|
||||
You are given a transcription of a conversation or speech. Your task is to identify where new topics or arguments begin within the text. A new topic is marked by a shift in subject, a new argument being introduced, or a clear change in focus. Additionally, ensure that each identified segment (paragraph) contains more than just a few sentences. If necessary, group smaller sections together so that each segment is substantial.
|
||||
|
||||
Input: The transcription will be provided as a series of lines. Each line is numbered for easy reference. Your task is to read through the transcription and identify the points where a new topic or argument begins. If a segment is too short (fewer than three sentences), it should be grouped with an adjacent segment. For each identified new topic, write a brief title or summary describing it.
|
||||
|
||||
Output: The output should be formatted as a JSON object, where each key is the line number where a new topic starts, and the corresponding value is the title of that topic. For example:
|
||||
|
||||
json
|
||||
|
||||
{
|
||||
"0": "Title of First Topic",
|
||||
"19": "Title of Second Topic"
|
||||
}
|
||||
|
||||
Example:
|
||||
|
||||
Transcription:
|
||||
|
||||
vbnet
|
||||
|
||||
0: Hello, everyone. Today we're going to talk about climate change.
|
||||
1: It's a complex issue, but I'll try to break it down.
|
||||
2: Climate change refers to long-term shifts in temperatures and weather patterns.
|
||||
3: In this discussion, we will cover the causes, effects, and possible solutions.
|
||||
4: First, let's discuss the causes of climate change.
|
||||
5: There are several factors, including greenhouse gas emissions.
|
||||
6: Most emissions come from burning fossil fuels like coal, oil, and gas.
|
||||
7: Another key cause is deforestation, which reduces the number of trees that can absorb CO2.
|
||||
8: Deforestation not only affects CO2 levels but also disrupts ecosystems.
|
||||
9: Next, let's move on to the effects of climate change.
|
||||
10: Rising temperatures are one of the most obvious effects.
|
||||
11: This leads to melting ice caps, rising sea levels, and extreme weather events.
|
||||
12: We are already seeing more frequent and intense heatwaves, hurricanes, and floods.
|
||||
13: The impact on wildlife is also severe, with many species facing habitat loss.
|
||||
14: Finally, what can we do to address this issue?
|
||||
15: One solution is to reduce our carbon footprint by using energy more efficiently.
|
||||
16: Renewable energy sources, like wind and solar, play a big role here.
|
||||
17: Governments and organizations worldwide are investing in clean energy technologies.
|
||||
18: Individual actions, such as reducing waste and conserving water, also make a difference.
|
||||
19: That's all for today's discussion on climate change. Thank you for listening.
|
||||
|
||||
Expected Output:
|
||||
|
||||
json
|
||||
|
||||
{
|
||||
"0": "Introduction to Climate Change",
|
||||
"4": "Causes of Climate Change",
|
||||
"9": "Effects of Climate Change",
|
||||
"14": "Solutions to Climate Change"
|
||||
}
|
||||
|
||||
Instruction: Please read the transcription carefully and list the line numbers where each new topic or argument starts, along with a brief title summarizing that topic. Ensure that shorter segments are grouped with neighboring content to form more substantial paragraphs before marking the start of a new topic.`,
|
||||
}
|
||||
|
||||
currentStart := 0
|
||||
blockSize := 200
|
||||
|
||||
var startLines []int
|
||||
|
||||
for currentStart < len(transcriptionSentences) {
|
||||
if currentStart > len(transcriptionSentences) {
|
||||
break
|
||||
}
|
||||
|
||||
blockEnd := currentStart+blockSize
|
||||
|
||||
if blockEnd >= len(transcriptionSentences) {
|
||||
blockEnd = len(transcriptionSentences) - 1
|
||||
}
|
||||
|
||||
currentSlice := transcriptionSentences[currentStart : blockEnd]
|
||||
|
||||
userPromptText := ""
|
||||
for i, sentence := range currentSlice {
|
||||
userPromptText += fmt.Sprintf("%d: %s\n", i, sentence)
|
||||
}
|
||||
|
||||
userPrompt := openai.ChatCompletionMessage{
|
||||
Role: openai.ChatMessageRoleUser,
|
||||
Content: userPromptText,
|
||||
}
|
||||
|
||||
resp1, err := wt.client.CreateChatCompletion(
|
||||
context.Background(),
|
||||
openai.ChatCompletionRequest{
|
||||
Model: "mistral-7b-instruct-v0.3",
|
||||
Messages: []openai.ChatCompletionMessage{systemPrompt, userPrompt},
|
||||
ResponseFormat: &openai.ChatCompletionResponseFormat{
|
||||
Type: openai.ChatCompletionResponseFormatTypeJSONObject,
|
||||
JSONSchema: &openai.ChatCompletionResponseFormatJSONSchema{
|
||||
Name: "splitter",
|
||||
Strict: true,
|
||||
Schema: SplitResponse{},
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
fmt.Printf("Splitting error: %v\n", err)
|
||||
return []int{}
|
||||
}
|
||||
fmt.Printf(resp1.Choices[0].Message.Content)
|
||||
var data map[string]string
|
||||
json.Unmarshal([]byte(resp1.Choices[0].Message.Content), &data)
|
||||
//startLinesRaw = append(startLinesRaw, strings.Split(resp1.Choices[0].Message.Content, ",")...)
|
||||
|
||||
for k, _ := range data {
|
||||
before, _, _ := strings.Cut(k, "-")
|
||||
q, err := strconv.Atoi(before)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
startLines = append(startLines, currentStart+q)
|
||||
}
|
||||
|
||||
fmt.Println(startLines)
|
||||
|
||||
currentStart += blockSize
|
||||
}
|
||||
/*fmt.Println(startLinesRaw)
|
||||
|
||||
var err error
|
||||
for i, n := range startLinesRaw {
|
||||
startLines[i], err = strconv.Atoi(n)
|
||||
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
}*/
|
||||
sort.Ints(startLines)
|
||||
return startLines
|
||||
}
|
||||
|
||||
func (wt *ParagraphDetector) Split(name string, transcription string) ([]string, error) {
|
||||
transcriptionSentences := strings.Split(transcription, ".")
|
||||
|
||||
cache, err := wt.loadCache(name)
|
||||
|
||||
var startLines []int
|
||||
if wt.useCache && err == nil {
|
||||
startLines = cache
|
||||
} else {
|
||||
startLines = wt.splitUsingLLM(transcriptionSentences)
|
||||
wt.saveCache(name, startLines)
|
||||
}
|
||||
|
||||
var splittedTranscription []string
|
||||
for i := range startLines {
|
||||
q := len(transcriptionSentences) - 1
|
||||
|
||||
if i < len(startLines) - 1 {
|
||||
q = startLines[i+1]
|
||||
}
|
||||
|
||||
var currentParagraph string
|
||||
for j := startLines[i]; j <= q; j++ {
|
||||
currentParagraph += transcriptionSentences[j] + "."
|
||||
}
|
||||
splittedTranscription = append(splittedTranscription, currentParagraph)
|
||||
}
|
||||
|
||||
|
||||
return splittedTranscription, nil
|
||||
}
|
||||
Reference in New Issue
Block a user