7676 border : 1px solid # 666 ;
7777 }
7878
79+ /* Remote audio control styling */
7980 .audio-container {
8081 margin : 10px 0 ;
8182 padding : 8px ;
@@ -122,7 +123,7 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
122123 value ="Your OPENAI_API_KEY from platform.openai.com ">
123124 < br >
124125 < label for ="temperature "> Temperature: </ label >
125- < input id ="temperature " type ="number " step ="0.1 " min ="0 " max ="1 " placeholder ="Temperature " value ="0.7 "
126+ < input id ="temperature " type ="number " step ="0.1 " min ="0 " max ="2 " placeholder ="Temperature " value ="1.0 "
126127 style ="width: 60px ">
127128 < br >
128129 < label for ="voice "> Select Voice: </ label >
@@ -215,9 +216,11 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
215216 } ;
216217 const payload = {
217218 model : model ,
218- voice : voiceEl . value ,
219- input_audio_transcription : { model : "whisper-1" } ,
220- instructions : sessionInstructionsEl . value
219+ // expires_at: Math.floor(Date.now() / 1000) + 60, // #Fail - unknown parameter!
220+ // load these later
221+ // voice: voiceEl.value,
222+ // input_audio_transcription: {model: "whisper-1"},
223+ // instructions: sessionInstructionsEl.value
221224 } ;
222225
223226 const response = await fetch ( "https://api.openai.com/v1/realtime/sessions" , {
@@ -231,19 +234,59 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
231234 return null ;
232235 }
233236 const data = await response . json ( ) ;
234- const key = data ?. client_secret ?. value ;
235- if ( ! key ) {
237+ const token = data ?. client_secret ?. value ;
238+ if ( ! token ) {
236239 console . error ( "Invalid response format:" , data ) ;
237240 return null ;
238241 }
239- console . log ( "Received ephemeral key:" , key ) ;
240- return key ;
242+ // For debugging - don't make this easy to abuse
243+ console . log ( `Received ephemeral key: ${ token } will expire at ${ new Date ( data ?. client_secret ?. expires_at * 1000 )
244+ . toLocaleString ( ) } `, data ) ;
245+ return token ;
241246 } catch ( error ) {
242247 console . error ( "Error fetching ephemeral key:" , error ) ;
243248 return error ;
244249 }
245250 }
246251
252+
/**
 * Send the initial session configuration and the opening prompt.
 * This is called when the data channel is opened.
 *
 * Reads "sessionInstructions", "startInstructions" and "temperature" from
 * localStorage (falling back to the file-level defaults), then sends a
 * `session.update` event followed by a `response.create` event over `dc`
 * and logs "Session started.".
 *
 * @returns {void}
 */
function sessionStartMessages() {
  // Same open-channel check the other senders in this file use; nothing is
  // sent (or logged as started) unless the channel can actually carry events.
  if (dc?.readyState !== "open") {
    console.warn("Data channel is not open; session start messages not sent.");
    return;
  }

  const sessionInstruct = localStorage.getItem("sessionInstructions") || defaultSessionInstructions;
  const startInstruct = localStorage.getItem("startInstructions") || defaultStartInstructions;

  // A stored temperature of 0 is falsy, so `||` would silently replace it with
  // the default. Fall back only when the stored value is missing or not a
  // finite number.
  const storedTemperature = Number.parseFloat(localStorage.getItem("temperature"));
  const temperature = Number.isFinite(storedTemperature) ? storedTemperature : defaultTemperature;

  // Update the session
  const systemMessage = {
    type: "session.update",
    session: {
      instructions: sessionInstruct,
      voice: voiceEl.value,
      tools: gptFunctions,
      tool_choice: "auto",
      input_audio_transcription: { model: "whisper-1" },
      temperature,
    },
  };
  dc.send(JSON.stringify(systemMessage));

  // Start instructions
  const startMessage = {
    type: "response.create",
    response: {
      modalities: ["text", "audio"],
      instructions: startInstruct,
      max_output_tokens: 100,
    },
  };
  dc.send(JSON.stringify(startMessage));
  appendOrUpdateLog("Session started.", "system-message");
}
289+
247290 /**
248291 * Handle incoming DataChannel messages from the GPT server.
249292 */
@@ -312,8 +355,8 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
312355
313356 // An ephemeral key is required to start a session
314357 // Usually this should be requested from your server to avoid exposing the OPENAI_API_KEY
315- const ephemeralKey = await fetchEphemeralKey ( apiKey ) ;
316- if ( ! ephemeralKey || ephemeralKey === "error" ) {
358+ const ephemeralToken = await fetchEphemeralKey ( apiKey ) ;
359+ if ( ! ephemeralToken || ephemeralToken === "error" ) {
317360 toggleSessionButtons ( false ) ;
318361 return ;
319362 }
@@ -327,7 +370,7 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
327370 console . error ( "Failed to get local stream" ) ;
328371 return ;
329372 }
330- track = stream . getTracks ( ) [ 0 ] ;
373+ [ track ] = stream . getAudioTracks ( ) ;
331374 } catch ( err ) {
332375 console . error ( "Error accessing mic:" , err ) ;
333376 appendOrUpdateLog ( "Mic access error. Check permissions." , "system-message" ) ;
@@ -340,47 +383,16 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
340383 // Create PeerConnection
341384 pc = new RTCPeerConnection ( ) ;
342385
343- // Create an audio element for the remote track
344- const audioEl = document . querySelector ( "audio" ) ;
345-
346386 // On receiving remote track
347- pc . ontrack = ( e ) => audioEl . srcObject = e . streams [ 0 ] ;
387+ pc . ontrack = ( e ) => remoteAudioEl . srcObject = e . streams [ 0 ] ;
348388 // Add the local audio track and reference to its source stream
349389 pc . addTrack ( track , stream ) ;
350390
351391 // Create data channel
352- dc = pc . createDataChannel ( "oai-events " ) ;
392+ dc = pc . createDataChannel ( "oai" ) ;
353393
354394 // Send session instructions upon opening the data channel
355- dc . addEventListener ( "open" , ( ) => {
356- const sessionInstruct = localStorage . getItem ( "sessionInstructions" ) || "You are a friendly assistant" ;
357- const startInstruct = localStorage . getItem ( "startInstructions" ) || "Greet the user and ask how you can help" ;
358- const temperature = parseFloat ( localStorage . getItem ( "temperature" ) ) || 0.7 ;
359-
360- // Update the session
361- const systemMessage = {
362- type : "session.update" ,
363- session : {
364- instructions : sessionInstruct ,
365- tools : gptFunctions ,
366- tool_choice : "auto"
367- }
368- } ;
369- dc . send ( JSON . stringify ( systemMessage ) ) ;
370-
371- // Start instructions
372- const startMessage = {
373- type : "response.create" ,
374- response : {
375- modalities : [ "text" , "audio" ] ,
376- instructions : startInstruct ,
377- temperature : temperature ,
378- max_output_tokens : 100
379- }
380- } ;
381- dc . send ( JSON . stringify ( startMessage ) ) ;
382- appendOrUpdateLog ( "Session started." , "system-message" ) ;
383- } ) ;
395+ dc . addEventListener ( "open" , ( ) => sessionStartMessages ( ) ) ;
384396
385397 // Handle incoming messages from the server
386398 dc . addEventListener ( "message" , handleMessage ) ;
@@ -390,22 +402,32 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
390402
391403 // Create answer
392404 const baseUrl = "https://api.openai.com/v1/realtime" ;
393- const sdpResp = await fetch ( `${ baseUrl } ?model=${ model } ` , {
405+ const response = await fetch ( `${ baseUrl } ?model=${ model } ` , {
394406 method : "POST" ,
395407 body : pc . localDescription . sdp ,
396408 headers : {
397- Authorization : `Bearer ${ ephemeralKey } ` ,
409+ Authorization : `Bearer ${ ephemeralToken } ` ,
398410 "Content-Type" : "application/sdp"
399411 } ,
400412 } ) ;
401- if ( ! sdpResp . ok ) {
402- console . error ( "Failed to fetch SDP answer:" , await sdpResp . text ( ) ) ;
413+ if ( ! response . ok ) {
414+ console . error ( "Failed to fetch SDP answer:" , await response . text ( ) ) ;
403415 }
404- const answer = { type : "answer" , sdp : await sdpResp . text ( ) } ;
416+ const answer = { type : "answer" , sdp : await response . text ( ) } ;
405417 await pc . setRemoteDescription ( answer ) ;
406418
407- toggleSessionButtons ( true ) ;
419+ // Wait for connection to be established before proceeding
420+ await new Promise ( ( resolve , reject ) => {
421+ const timeout = setTimeout ( ( ) => reject ( `Connection timeout. Current state: ${ pc . connectionState } ` ) , 10_000 ) ;
422+ pc . addEventListener ( "connectionstatechange" , ( ) => {
423+ if ( pc . connectionState === "connected" ) {
424+ clearTimeout ( timeout ) ;
425+ resolve ( ) ;
426+ }
427+ } ) ;
428+ } ) ;
408429
430+ toggleSessionButtons ( true ) ;
409431 console . log ( "Realtime session started!" ) ;
410432 } catch ( err ) {
411433 console . error ( "Error starting session:" , err ) ;
@@ -423,16 +445,28 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
423445 */
424446 async function endSession ( instructions = endInstructionsEl . value || defaultEndInstructions ) {
425447 console . log ( "Ending session..." ) ;
426- const message = {
427- type : "response.create" ,
428- response : {
429- modalities : [ "text" , "audio" ] ,
430- instructions : instructions ,
431- temperature : parseFloat ( temperatureEl . value ) ,
432- max_output_tokens : 200
433- }
434- } ;
435- if ( dc && dc . readyState === "open" ) {
448+
449+ if ( dc ?. readyState === "open" ) {
450+ // Close after the final message
451+ dc . addEventListener ( "message" , ( event ) => {
452+ const message = JSON . parse ( event . data ) ;
453+ if ( message . type === "output_audio_buffer.stopped" ) {
454+ pc . close ( ) ;
455+
456+ console . log ( "Session ended." ) ;
457+ appendOrUpdateLog ( "Session ended." , "system-message" ) ;
458+ toggleSessionButtons ( false ) ;
459+ }
460+ } ) ;
461+
462+ const message = {
463+ type : "response.create" ,
464+ response : {
465+ modalities : [ "text" , "audio" ] ,
466+ instructions : instructions ,
467+ max_output_tokens : 200
468+ }
469+ } ;
436470 dc . send ( JSON . stringify ( message ) ) ;
437471 }
438472
@@ -443,17 +477,6 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
443477
444478 endButtonEl . disabled = true ;
445479
446- // Close after the final message
447- dc . addEventListener ( "message" , ( event ) => {
448- const message = JSON . parse ( event . data ) ;
449- if ( message . type === "output_audio_buffer.stopped" ) {
450- pc . close ( ) ;
451-
452- console . log ( "Session ended." ) ;
453- appendOrUpdateLog ( "Session ended." , "system-message" ) ;
454- toggleSessionButtons ( false ) ;
455- }
456- } ) ;
457480 }
458481</ script >
459482
@@ -480,7 +503,7 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
480503 const autoSaveStatusEl = $ ( "autoSaveStatus" ) ;
481504 const settingsButtonEl = $ ( "settingsButton" ) ;
482505 const settingsFormEl = $ ( "settingsForm" ) ;
483- const remoteAudio = $ ( 'remoteAudio' ) ;
506+ const remoteAudioEl = $ ( 'remoteAudio' ) ;
484507 const muteButton = $ ( 'muteButton' ) ;
485508 const volumeSlider = $ ( 'volumeSlider' ) ;
486509 const logEl = $ ( "log" ) ;
@@ -493,19 +516,18 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
493516 const defaultSessionInstructions = "You are a friendly assistant" ;
494517 const defaultStartInstructions = "Greet the user and ask how you can help" ;
495518 const defaultEndInstructions = "Give a quick good-bye. Sometimes remind the user to press the button to start a new session." ;
496- const defaultTemperature = 0.7 ;
519+ const defaultTemperature = 1.0 ;
497520 const defaultVoice = "alloy" ;
498521
499522
500523 // Volume controls
501- volumeSlider . addEventListener ( 'input' , ( e ) => {
502- remoteAudio . volume = e . target . value ;
503- } ) ;
524+ volumeSlider . addEventListener ( 'input' , ( e ) => remoteAudioEl . volume = e . target . value ) ;
504525
526+ // Mute button
505527 let isMuted = false ;
506528 muteButton . addEventListener ( 'click' , ( ) => {
507529 isMuted = ! isMuted ;
508- remoteAudio . muted = isMuted ;
530+ remoteAudioEl . muted = isMuted ;
509531 muteButton . textContent = isMuted ? '🔇' : '🔊' ;
510532 } ) ;
511533
@@ -523,6 +545,9 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
523545
524546 /**
525547 * Insert a new log entry or update an existing one (by messageId).
548+ * @param {string } message - The message to log.
549+ * @param {string } className - Optional CSS class for styling
550+ * @param {string } messageId - Optional ID to update an existing log entry.
526551 */
527552 function appendOrUpdateLog ( message , className = "" , messageId = null ) {
528553 let logEntry ;
@@ -542,6 +567,8 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
542567
543568 /**
544569 * Load settings from localStorage into UI fields.
570+ * If not set, use default values.
571+ * @returns {void }
545572 */
546573 function loadSettings ( ) {
547574 keyEl . value = localStorage . getItem ( "openaiApiKey" ) || defaultKey ;
@@ -554,6 +581,7 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
554581
555582 /**
556583 * Autosave changed settings to localStorage (debounced by 2s).
584+ * @returns {void }
557585 */
558586 function autoSaveSettings ( ) {
559587 const settings = {
@@ -573,6 +601,7 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
573601
574602 /**
575603 * Reset settings and re-save them to localStorage.
604+ * @returns {void }
576605 */
577606 function resetSettings ( ) {
578607 const settings = {
@@ -591,6 +620,7 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
591620
592621 /**
593622 * Send updated session instructions to the GPT server (if data channel is open).
623+ * @returns {void }
594624 */
595625 function sendNewSettings ( ) {
596626 if ( dc && dc . readyState === "open" ) {
@@ -622,6 +652,7 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
622652
623653 /**
624654 * Send the user's typed message to the GPT server over the data channel.
655+ * @returns {void }
625656 */
626657 function sendText ( ) {
627658 const text = textInputEl . value . trim ( ) ;
@@ -655,13 +686,16 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
655686 await startSession ( ) ;
656687 } ) ;
657688
// End session button - cancel any in-progress response and close the session
endButtonEl.addEventListener("click", async () => {
  if (pc?.connectionState === "closed") {
    console.log(`No session to end. Connection state: ${pc.connectionState}`);
    appendOrUpdateLog("No session to end.", "system-message");
    return;
  }
  toggleSessionButtons(false);
  // Cancel any in-progress response - not needed for speech input due to turn detection.
  // Only send on an open channel: send() throws if the channel is missing or
  // not open, and that would skip endSession() after the buttons were toggled.
  if (dc?.readyState === "open") {
    dc.send(JSON.stringify({ type: "response.cancel" }));
  }
  await endSession();
});
667701
0 commit comments