   color: #f9f9f9;
   border: 1px solid #666;
 }
+
+.audio-container {
+  margin: 10px 0;
+  padding: 8px;
+  background: #333;
+  border: 1px solid #444;
+  border-radius: 4px;
+}
+
+.volume-control {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+}
+
+#volumeSlider {
+  width: 100px;
+  accent-color: #555;
+}
+
+#muteButton {
+  padding: 4px 8px;
+  margin-right: 8px;
+  background: #444;
+  border: 1px solid #555;
+  border-radius: 4px;
+  cursor: pointer;
+}
 </style>
 </head>
 
@@ -123,6 +151,16 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
   <span id="autoSaveStatus"></span>
 </div>
 
+<!-- Audio element for remote audio -->
+<div class="audio-container">
+  <audio id="remoteAudio" autoplay></audio>
+  <div class="volume-control">
+    <label for="volumeSlider">Assistant Audio Control:</label>
+    <button id="muteButton">🔊</button>
+    <input type="range" id="volumeSlider" min="0" max="1" step="0.1" value="1">
+  </div>
+</div>
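+<!-- The mute button and volume slider are wired to #remoteAudio in the script below -->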
+
 <!-- Log container with an input field at the bottom -->
 <div id="log">
   <div id="inputContainer">
@@ -143,12 +181,16 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
 -->
 <script id="rtc-logic">
   // Globals
-  let pc; // RTCPeerConnection
-  let track; // Local audio track
-  let dc; // Data channel
+  let pc;    // RTCPeerConnection
+  let track; // Local audio track
+  let dc;    // Data channel
   const assistantResults = {}; // Track interim/final transcripts
   const userMessages = {}; // Track user messages per item ID
 
+  // Expose live references to the console for debugging & fun
+  // (getters, because a plain `window.pc = pc` here would copy `undefined`)
+  Object.defineProperties(window, {
+    pc:    { get: () => pc },
+    track: { get: () => track },
+    dc:    { get: () => dc }
+  });
 
   // Model & function definitions
   const model = "gpt-4o-mini-realtime-preview";
@@ -174,7 +216,6 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
     const payload = {
       model: model,
       voice: voiceEl.value,
-      input_audio_format: "pcm16",
       input_audio_transcription: { model: "whisper-1" },
       instructions: sessionInstructionsEl.value
     };
@@ -244,9 +285,6 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
         if (name === "end_session") {
           console.log("Ending session based on user request");
           endSession();
-        } else if (name === "close_tab") {
-          console.log("User confirmed closing tab");
-          endSession("Tell the user they will need to navigate back to this page to start a new chat.");
         }
         break;
 
@@ -268,15 +306,33 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
     if (!apiKey || apiKey === "Your OPENAI_API_KEY from platform.openai.com") {
       console.error("No OpenAI API Key provided");
       appendOrUpdateLog("Error: No OpenAI API Key provided.", "system-message");
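+      // Reset the buttons to the no-session state so the user can retry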
+      toggleSessionButtons(false);
       return;
     }
 
     // An ephemeral key is required to start a session
     // Usually this should be requested from your server to avoid exposing the OPENAI_API_KEY
     const ephemeralKey = await fetchEphemeralKey(apiKey);
     if (!ephemeralKey || ephemeralKey === "error") {
-      toggleSessionButtons(false);
-      return;
+      toggleSessionButtons(false);
+      return;
+    }
+
+    // Capture the local mic
+    // technically optional for the API, but required in this example
+    let stream;
+    try {
+      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      if (!stream) {
+        console.error("Failed to get local stream");
+        return;
+      }
+      track = stream.getTracks()[0];
+    } catch (err) {
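+      // getUserMedia rejects (e.g. NotAllowedError) when mic access is denied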
+      console.error("Error accessing mic:", err);
+      appendOrUpdateLog("Mic access error. Check permissions.", "system-message");
+      toggleSessionButtons(false);
+      return;
     }
 
     // Start the WebRTC session
@@ -285,31 +341,15 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
     pc = new RTCPeerConnection();
 
-    // Create an audio element for the remote track
-    const audioEl = document.createElement("audio");
-    audioEl.autoplay = true;
-    document.body.appendChild(audioEl);
+    // Grab the audio element for the remote track
+    const audioEl = document.querySelector("audio");
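+    // (the #remoteAudio element added to the markup above)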
 
     // On receiving remote track
     pc.ontrack = (e) => audioEl.srcObject = e.streams[0];
+    // Add the local audio track and a reference to its source stream
+    pc.addTrack(track, stream);
 
-    // Get local mic and add to PeerConnection
-    try {
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      if (!stream) {
-        console.error("Failed to get local stream");
-        return;
-      }
-      track = stream.getTracks()[0];
-      pc.addTrack(track, stream);
-    } catch (err) {
-      console.error("Error accessing mic:", err);
-      appendOrUpdateLog("Mic access error. Check permissions.", "system-message");
-      toggleSessionButtons(false);
-    }
-
-    // Create data channel once
+    // Create data channel
     dc = pc.createDataChannel("oai-events");
-    dc.addEventListener("message", handleMessage);
 
     // Send session instructions upon opening the data channel
     dc.addEventListener("open", () => {
@@ -342,9 +382,13 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
       appendOrUpdateLog("Session started.", "system-message");
     });
 
+    // Handle incoming messages from the server
+    dc.addEventListener("message", handleMessage);
+
     // Implicit offer: setLocalDescription() with no argument creates and applies it
     await pc.setLocalDescription();
 
+    // Send the offer to the Realtime API and receive the answer SDP
     const baseUrl = "https://api.openai.com/v1/realtime";
     const sdpResp = await fetch(`${baseUrl}?model=${model}`, {
       method: "POST",
@@ -377,13 +421,13 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
   * End the current session, optionally sending instructions for a closing message.
   * @param {string} instructions - Instructions for the closing message.
   */
-  async function endSession(instructions = "") {
+  async function endSession(instructions = endInstructionsEl.value || defaultEndInstructions) {
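+    // Default parameters are evaluated at call time, so the textarea's current value is read on each call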
     console.log("Ending session...");
     const message = {
       type: "response.create",
       response: {
         modalities: ["text", "audio"],
-        instructions: instructions || endInstructionsEl.value || defaultEndInstructions,
+        instructions: instructions,
         temperature: parseFloat(temperatureEl.value),
         max_output_tokens: 200
       }
@@ -436,6 +480,9 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
   const autoSaveStatusEl = $("autoSaveStatus");
   const settingsButtonEl = $("settingsButton");
   const settingsFormEl = $("settingsForm");
+  const remoteAudio = $("remoteAudio");
+  const muteButton = $("muteButton");
+  const volumeSlider = $("volumeSlider");
   const logEl = $("log");
   const inputContainerEl = $("inputContainer");
   const textInputEl = $("textInput");
@@ -450,6 +497,19 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
   const defaultVoice = "alloy";
 
 
+  // Volume controls
+  volumeSlider.addEventListener("input", (e) => {
+    remoteAudio.volume = e.target.value;
+  });
+
+  let isMuted = false;
+  muteButton.addEventListener("click", () => {
+    isMuted = !isMuted;
+    remoteAudio.muted = isMuted;
+    muteButton.textContent = isMuted ? "🔇" : "🔊";
+  });
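+  // muted is independent of volume, so unmuting restores the previous slider level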
+
+
   /**
   * Toggle the session control buttons based on the session state.
   * @param {boolean} isSessionActive - Whether a session is currently active.