-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathesi-dictate.el
237 lines (197 loc) · 10.3 KB
/
esi-dictate.el
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
;;; esi-dictate.el --- Dictation with Real-Time Editing -*- lexical-binding: t; -*-
;; Copyright (c) 2024 Abhinav Tushar
;; Author: Abhinav Tushar <abhinav@lepisma.xyz>
;; Version: 0.3.0
;; Package-Requires: ((emacs "29") (llm "0.17.2"))
;; Keywords: speech
;; URL: https://github.com/lepisma/emacs-speech-input
;;; Commentary:
;; Dictation with Real-Time Editing
;; This file is not a part of GNU Emacs.
;;; License:
;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
;;; Code:
(require 'json)
(require 'llm)
(require 'llm-openai)
(defgroup esi-dictate nil
  "Dictation with real-time editing."
  :group 'convenience
  :prefix "esi-dictate-")

(defcustom esi-dictate-dg-api-key nil
  "API key for Deepgram, or nil when no key has been configured.
`esi-dictate-start' hands it to the transcription process through
the DG_API_KEY environment variable."
  ;; The default is nil, so a bare `string' type would flag the
  ;; default value as a mismatch in the Customize interface.
  :type '(choice (const :tag "Not set" nil) string)
  :group 'esi-dictate)
(defcustom esi-dictate-llm-provider nil
  "LLM provider to use for corrections.
The value is passed as the provider argument of `llm-chat-async'
by `esi-dictate--call-llm'."
  ;; A provider is an arbitrary Lisp object, so `sexp' is the closest
  ;; customization type; declaring it silences the missing-type warning.
  :type 'sexp)
(defcustom esi-dictate-speech-final-hook nil
  "Hook run once the ASR reports a speech utterance as finalized.
`esi-dictate-insert' runs it after inserting a transcription item
whose `speech_final' field is not false."
  ;; The customization type for hooks is the symbol `hook'; the
  ;; keyword `:hook' is not a valid type.
  :type 'hook)
(defcustom esi-dictate-cursor
  "⤳"
  "String displayed as the voice cursor.
It is shown as the `after-string' of the context overlay, i.e. at
the position where dictated text gets inserted."
  :type 'string)
(defvar esi-dictate--dg-process
  nil
  "Process object running the Deepgram script, or nil when none is running.
Set by `esi-dictate-start' and reset by `esi-dictate--clear-process'.")
(defcustom esi-dictate-llm-prompt
  "You are a dictation assistant, you will be given transcript by the user with speech disfluencies, minor mistakes, and edits and you have to return a corrected transcript. The user might give you their stream of consciousness and you have to ensure that you correctly identify a request to edit and don't misfire. You don't have to generate any new information, just ensure fixes in spoken transcripts and edits as asked."
  "System prompt given to the LLM that corrects dictated text.
`esi-dictate--call-llm' uses it as the context of the chat prompt."
  :type 'string)
(defcustom esi-dictate-fix-examples
  '(("I wan to write about umm something related to food. My name is name is Abhinav"
     . "I want to write about umm something related to food. My name is Abhinav.")
    ("Okay we will start. Let's write something about chairs. No not chairs, make it tables."
     . "Let's write something about tables.")
    ("I want to write something that's difficult to transcribe and then try correcting that. Write my name as abcd. No separate the letters with . please"
     . "I want to write something that's difficult to transcribe and then try correcting that. Write my name as a.b.c.d.")
    ("hi easy, what are you doing? It's e s i."
     . "hi esi, what are you doing?"))
  "Few-shot examples for the automatic edits.
Each element is a cons cell (INPUT . OUTPUT) pairing a raw
transcript with its corrected form.  `esi-dictate--call-llm'
passes the list as the examples of the chat prompt, so changing
it changes how the dictation intelligence behaves."
  :type '(repeat (cons string string)))
(defface esi-dictate-intermittent-face
  '((t :inherit font-lock-comment-face))
  "Face for interim transcription text.
Applied to text the ASR has not yet marked final, which may
therefore still be replaced.")
(defface esi-dictate-context-face
  '((t :inherit link))
  "Face marking the voice context region.
The context overlay uses it to highlight the text that is sent to
the LLM for edits.")
(defface esi-dictate-cursor-face
  '((t :inherit default))
  "Face applied to the `esi-dictate-cursor' string.")
(defvar esi-dictate-mode-map (make-sparse-keymap)
  "Keymap active while `esi-dictate-mode' is enabled.
It starts out empty.")
(defvar-local esi-dictate-context-overlay
  nil
  "Overlay delimiting the voice context in this buffer, or nil.
The overlay's region is the text used as context for corrections.
Its end position is also where transcriptions coming from the ASR
are inserted, so the overlay doubles as the tracker of where the
context currently is.")
(define-minor-mode esi-dictate-mode
  "Minor mode that is on while dictation is in progress.
`esi-dictate-start' enables it and `esi-dictate-stop' disables it.
Key bindings for the mode live in `esi-dictate-mode-map'."
  :keymap esi-dictate-mode-map
  :init-value nil)
(defun esi-dictate--write-fix (edited-content)
  "Replace the text of the voice context with EDITED-CONTENT.
The region covered by `esi-dictate-context-overlay' is deleted,
EDITED-CONTENT is inserted in its place and the overlay is moved
to cover the new text.  Point is restored relative to the
replaced region.

This function is used as the response callback of
`llm-chat-async', so it may run after the context overlay has
been cleared (for instance by `esi-dictate-stop').  In that case
there is nothing to replace and the function does nothing."
  (when (and esi-dictate-context-overlay
             ;; A deleted overlay has no buffer and no positions.
             (overlay-buffer esi-dictate-context-overlay))
    (let ((beg-pos (overlay-start esi-dictate-context-overlay))
          (end-pos (overlay-end esi-dictate-context-overlay))
          (past-point (point)))
      (delete-region beg-pos end-pos)
      (goto-char beg-pos)
      (insert edited-content)
      ;; `save-excursion' is replicated by hand because the whole
      ;; region is deleted and replaced, which would collapse a saved
      ;; point inside the region onto its beginning.
      (let ((current-point (point)))
        (goto-char
         (cond
          ;; Point was before the context: it has not moved.
          ((<= past-point beg-pos) past-point)
          ;; Point was inside the context: keep it, clamped to the
          ;; end of the new text.
          ((<= past-point end-pos) (min past-point current-point))
          ;; Point was after the context: keep its distance from the
          ;; end of the context.
          (t (+ current-point (- past-point end-pos)))))
        ;; Recover the overlay so it spans exactly the new text.
        (move-overlay esi-dictate-context-overlay beg-pos current-point)))))
(defun esi-dictate--call-llm (content)
  "Ask the LLM to clean up CONTENT, a piece of dictated text.
CONTENT is expected to contain speech disfluencies and other
dictation artifacts.  The chat prompt is built from
`esi-dictate-llm-prompt' and `esi-dictate-fix-examples', CONTENT
is appended to it, and the request is sent asynchronously to
`esi-dictate-llm-provider'.  The response is handed to
`esi-dictate--write-fix'; errors are reported with `message'."
  (let ((request (make-llm-chat-prompt :examples esi-dictate-fix-examples
                                       :context esi-dictate-llm-prompt))
        (report-error (lambda (err err-message)
                        (message "[esi] Error %s: %s" err err-message))))
    (llm-chat-prompt-append-response request content)
    (llm-chat-async esi-dictate-llm-provider
                    request
                    #'esi-dictate--write-fix
                    report-error)))
(defun esi-dictate-fix-context ()
  "Fix the voice context using the general transcription fixing instructions.
The text covered by `esi-dictate-context-overlay' is sent, without
text properties, to `esi-dictate--call-llm'.

Signal a `user-error' when the current buffer has no voice
context."
  (interactive)
  ;; Without this check a missing overlay surfaces as an opaque
  ;; `wrong-type-argument' error from `overlay-start'.
  (unless (and esi-dictate-context-overlay
               (overlay-buffer esi-dictate-context-overlay))
    (user-error "[esi] No voice context in this buffer"))
  (let ((beg-pos (overlay-start esi-dictate-context-overlay))
        (end-pos (overlay-end esi-dictate-context-overlay)))
    (esi-dictate--call-llm (buffer-substring-no-properties beg-pos end-pos))))
(defun esi-dictate--clear-process ()
  "Delete the process stored in `esi-dictate--dg-process', if any.
The variable is reset to nil afterwards.  Do nothing when no
process is recorded."
  (let ((proc esi-dictate--dg-process))
    (when proc
      (delete-process proc)
      (setq esi-dictate--dg-process nil))))
(defun esi-dictate-make-context-overlay ()
  "Create and return a fresh context overlay in the current buffer.
The overlay spans the region when it is active and is empty at
point otherwise.  Its end advances when text is inserted there.
It is displayed with `esi-dictate-context-face' and followed by
`esi-dictate-cursor' in `esi-dictate-cursor-face'."
  (let* ((use-region (region-active-p))
         (from (if use-region (region-beginning) (point)))
         (to (if use-region (region-end) (point)))
         ;; Last argument: rear-advance, so insertions at the end of
         ;; the overlay become part of it.
         (ov (make-overlay from to nil nil t))
         (cursor (propertize esi-dictate-cursor 'face 'esi-dictate-cursor-face)))
    (overlay-put ov 'face 'esi-dictate-context-face)
    (overlay-put ov 'after-string cursor)
    ov))
(defun esi-dictate-clear-context-overlay ()
  "Delete the current buffer's context overlay, if any.
`esi-dictate-context-overlay' is nil afterwards in every case."
  (let ((ov esi-dictate-context-overlay))
    (if ov
        (delete-overlay ov)))
  (setq esi-dictate-context-overlay nil))
(defun esi-dictate-insert (transcription-item)
  "Insert TRANSCRIPTION-ITEM at the end of the voice context.
TRANSCRIPTION-ITEM is an alist.  The fields read here are `start'
\(a number identifying the utterance), `is_final', `speech_final'
and the `transcript' of the first element of `alternatives' inside
`channel'.

The transcript, followed by a space, is inserted at the end of
`esi-dictate-context-overlay'.  When the text just before that
position came from an item with the same `start', that text is an
earlier interim result of the same utterance and is deleted first.
Text whose `is_final' is false is shown in
`esi-dictate-intermittent-face'.  Unless `speech_final' is false,
`esi-dictate-speech-final-hook' is run after the insertion."
  (let* ((id (alist-get 'start transcription-item))
         (text (alist-get 'transcript
                          (aref (alist-get 'alternatives
                                           (alist-get 'channel transcription-item))
                                0)))
         (context-end (overlay-end esi-dictate-context-overlay))
         ;; There is no previous character to inspect when the
         ;; overlay ends at `point-min'.
         (prev-item (when (> context-end (point-min))
                      (get-text-property (1- context-end)
                                         'esi-dictate-transcription-item))))
    ;; If previous item and current are the same utterance, delete the
    ;; previous item and then insert the new one.  This handles
    ;; interim results from the ASR.
    (when (and prev-item (= id (alist-get 'start prev-item)))
      ;; Locate the start of the previous item from the extent of its
      ;; text property rather than from the integer saved in
      ;; `esi-dictate-start': that integer goes stale as soon as text
      ;; is added or removed earlier in the buffer.
      (delete-region
       (or (previous-single-property-change context-end
                                            'esi-dictate-transcription-item)
           (point-min))
       context-end))
    (let ((insertion-pos (overlay-end esi-dictate-context-overlay)))
      (save-excursion
        (goto-char insertion-pos)
        (insert text " "))
      ;; The context overlay's end advances over the inserted text, so
      ;; it now marks the end of what was just inserted.
      (let ((inserted-end (overlay-end esi-dictate-context-overlay)))
        (when (eq :false (alist-get 'is_final transcription-item))
          (let ((interim (make-overlay insertion-pos inserted-end)))
            (overlay-put interim 'face 'esi-dictate-intermittent-face)
            ;; Let Emacs delete the overlay once the interim text is
            ;; replaced; otherwise empty overlays pile up.
            (overlay-put interim 'evaporate t)))
        ;; Saving properties which will be read later to handle
        ;; interim results.
        (put-text-property insertion-pos inserted-end
                           'esi-dictate-transcription-item transcription-item)
        (put-text-property insertion-pos inserted-end
                           'esi-dictate-start insertion-pos))
      ;; This is utterance end according to the ASR.  In this case, we
      ;; run a few hooks.
      (when (not (eq :false (alist-get 'speech_final transcription-item)))
        (run-hooks 'esi-dictate-speech-final-hook)))))
(defun esi-dictate-filter-fn (process string)
  "Process filter reading the output of the script that talks to Deepgram.
STRING is appended to the output accumulated so far on PROCESS
\(kept in its `accumulated-output' property) and every complete
line is handled:

- a line starting with \"Output: \" carries a JSON object, which is
  parsed into an alist and given to `esi-dictate-insert';
- a line starting with \"Press Enter to stop recording\" means the
  script is ready, so the voice cursor is placed with
  `esi-dictate-move-here'.

Other lines are ignored.  A trailing partial line is stored back
on PROCESS until the rest of it arrives."
  (let ((pending (concat (or (process-get process 'accumulated-output) "")
                         string))
        (payload-tag "Output: "))
    (while (string-match "\n" pending)
      (let ((line (substring pending 0 (match-beginning 0))))
        ;; Drop the consumed line, newline included, before acting on it.
        (setq pending (substring pending (match-end 0)))
        (cond
         ((string-prefix-p payload-tag line)
          (esi-dictate-insert
           (json-parse-string (substring line (length payload-tag))
                              :object-type 'alist)))
         ((string-prefix-p "Press Enter to stop recording" line)
          (message "[esi] Dictation mode ready to use.")
          (esi-dictate-move-here)))))
    (process-put process 'accumulated-output pending)))
(defun esi-dictate-move-here ()
  "Move the voice cursor to point or, when it is active, to the region.
Any existing context overlay is discarded and a new one made by
`esi-dictate-make-context-overlay' becomes the buffer's
`esi-dictate-context-overlay'."
  (interactive)
  (esi-dictate-clear-context-overlay)
  (let ((fresh-overlay (esi-dictate-make-context-overlay)))
    (setq esi-dictate-context-overlay fresh-overlay)))
;;;###autoload
(defun esi-dictate-start ()
  "Start the real-time transcription process.
Any previous transcription process is deleted first.  The
command \"dg.py\" is then started with its output routed to
`esi-dictate-filter-fn', and `esi-dictate-mode' is enabled.

When `esi-dictate-dg-api-key' is non-nil it is exported to the
process as the DG_API_KEY environment variable.  When it is nil
the environment is passed through unchanged, so a DG_API_KEY
already present in Emacs' environment stays in effect."
  (interactive)
  (esi-dictate--clear-process)
  (setq esi-dictate--dg-process
        (let ((process-environment
               ;; Exporting a nil key would yield the literal string
               ;; \"DG_API_KEY=nil\" and shadow an inherited key.
               (if esi-dictate-dg-api-key
                   (cons (format "DG_API_KEY=%s" esi-dictate-dg-api-key)
                         process-environment)
                 process-environment)))
          (make-process :name "esi-dictate-dg"
                        :buffer "*esi-dictate-dg*"
                        :command (list "dg.py")
                        :filter #'esi-dictate-filter-fn)))
  (esi-dictate-mode 1)
  (message "[esi] Starting dictation mode ..."))
(defun esi-dictate-stop ()
  "Stop dictation.
Delete the transcription process, disable `esi-dictate-mode' and
remove the context overlay from the current buffer."
  (interactive)
  ;; Tear down in the reverse order of `esi-dictate-start': process
  ;; first, then the mode, then the voice cursor.
  (esi-dictate--clear-process)
  (esi-dictate-mode -1)
  (esi-dictate-clear-context-overlay)
  (message "[esi] Stopped dictation mode."))
(provide 'esi-dictate)
;;; esi-dictate.el ends here