
Commit 823d0fc

remote audio controls; more simplification
Parent: 79d7ccf

1 file changed: index.html (+91 additions, -31 deletions)
@@ -75,6 +75,34 @@
     color: #f9f9f9;
     border: 1px solid #666;
   }
+
+  .audio-container {
+    margin: 10px 0;
+    padding: 8px;
+    background: #333;
+    border: 1px solid #444;
+    border-radius: 4px;
+  }
+
+  .volume-control {
+    display: flex;
+    align-items: center;
+    gap: 8px;
+  }
+
+  #volumeSlider {
+    width: 100px;
+    accent-color: #555;
+  }
+
+  #muteButton {
+    padding: 4px 8px;
+    margin-right: 8px;
+    background: #444;
+    border: 1px solid #555;
+    border-radius: 4px;
+    cursor: pointer;
+  }
 </style>
</head>

@@ -123,6 +151,16 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
   <span id="autoSaveStatus"></span>
 </div>

+<!-- Audio element for remote audio -->
+<div class="audio-container">
+  <audio id="remoteAudio" autoplay></audio>
+  <div class="volume-control">
+    <label for="volumeSlider">Assistant Audio Control: </label>
+    <button id="muteButton">🔊</button>
+    <input type="range" id="volumeSlider" min="0" max="1" step="0.1" value="1">
+  </div>
+</div>
+
 <!-- Log container with an input field at the bottom -->
 <div id="log">
   <div id="inputContainer">
@@ -143,12 +181,16 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
 -->
 <script id="rtc-logic">
 // Globals
-let pc; // RTCPeerConnection
-let track; // Local audio track
-let dc; // Data channel
+let pc;    // RTCPeerConnection
+let track; // Local audio track
+let dc;    // Data channel
 const assistantResults = {}; // Track interim/final transcripts
 const userMessages = {};     // Track user messages per item ID

+// Expose to console for debugging & fun
+window.pc = pc;
+window.track = track;
+window.dc = dc;

 // Model & function definitions
 const model = "gpt-4o-mini-realtime-preview";
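
Note on the debugging exposure above: `pc`, `track`, and `dc` are still `undefined` when the `window` assignments run, and because these are plain value copies they will not reflect the reassignments made later in the session-setup code. A sketch of live getters that would track the current values instead (an alternative, not what this commit does):

    // Live getters: the console always sees the *current* values,
    // not the undefined captured at load time.
    Object.defineProperty(window, "pc",    { get: () => pc });
    Object.defineProperty(window, "track", { get: () => track });
    Object.defineProperty(window, "dc",    { get: () => dc });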
@@ -174,7 +216,6 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
 const payload = {
   model: model,
   voice: voiceEl.value,
-  input_audio_format: "pcm16",
   input_audio_transcription: {model: "whisper-1"},
   instructions: sessionInstructionsEl.value
 };
@@ -244,9 +285,6 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
 if (name === "end_session") {
   console.log("Ending session based on user request");
   endSession();
-} else if (name === "close_tab") {
-  console.log("User confirmed closing tab");
-  endSession("Tell the user they will need to navigate back to this page to start a new chat.");
 }
 break;

@@ -268,15 +306,33 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
 if (!apiKey || apiKey === "Your OPENAI_API_KEY from platform.openai.com") {
   console.error("No OpenAI API Key provided");
   appendOrUpdateLog("Error: No OpenAI API Key provided.", "system-message");
+  toggleSessionButtons(false);
   return;
 }

 // An ephemeral key is required to start a session
 // Usually this should be requested from your server to avoid exposing the OPENAI_API_KEY
 const ephemeralKey = await fetchEphemeralKey(apiKey);
 if (!ephemeralKey || ephemeralKey === "error") {
-    toggleSessionButtons(false);
-    return;
+  toggleSessionButtons(false);
+  return;
+}
+
+// Capture the local mic
+// technically optional for the API, but required in this example
+let stream;
+try {
+  stream = await navigator.mediaDevices.getUserMedia({audio: true});
+  if (!stream) {
+    console.error("Failed to get local stream");
+    return;
+  }
+  track = stream.getTracks()[0];
+} catch (err) {
+  console.error("Error accessing mic:", err);
+  appendOrUpdateLog("Mic access error. Check permissions.", "system-message");
+  toggleSessionButtons(false);
+  return;
 }

 // Start the WebRTC session
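
The comment in this hunk is the important security point: in production, the ephemeral key should be minted server-side so the real OPENAI_API_KEY never reaches the browser. A minimal Express-style sketch of such an endpoint, assuming Node 18+ (built-in `fetch`); the `/ephemeral-key` route name and response shape are illustrative, not part of this commit:

    // Server-side sketch: exchange the real API key for a short-lived client secret.
    const express = require("express");
    const app = express();

    app.post("/ephemeral-key", async (req, res) => {
      const r = await fetch("https://api.openai.com/v1/realtime/sessions", {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${process.env.OPENAI_API_KEY}`,
          "Content-Type": "application/json"
        },
        body: JSON.stringify({ model: "gpt-4o-mini-realtime-preview" })
      });
      const session = await r.json();
      // client_secret.value is the short-lived key the browser may safely hold
      res.json({ ephemeralKey: session.client_secret.value });
    });

    app.listen(3000);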
@@ -285,31 +341,15 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
 pc = new RTCPeerConnection();

 // Create an audio element for the remote track
-const audioEl = document.createElement("audio");
-audioEl.autoplay = true;
-document.body.appendChild(audioEl);
+const audioEl = document.querySelector("audio");

 // On receiving remote track
 pc.ontrack = (e) => audioEl.srcObject = e.streams[0];
+// Add the local audio track and reference to its source stream
+pc.addTrack(track, stream);

-// Get local mic and add to PeerConnection
-try {
-  const stream = await navigator.mediaDevices.getUserMedia({audio: true});
-  if (!stream) {
-    console.error("Failed to get local stream");
-    return;
-  }
-  track = stream.getTracks()[0];
-  pc.addTrack(track, stream);
-} catch (err) {
-  console.error("Error accessing mic:", err);
-  appendOrUpdateLog("Mic access error. Check permissions.", "system-message");
-  toggleSessionButtons(false);
-}
-
-// Create data channel once
+// Create data channel
 dc = pc.createDataChannel("oai-events");
-dc.addEventListener("message", handleMessage);

 // Send session instructions upon opening the data channel
 dc.addEventListener("open", () => {
@@ -342,9 +382,13 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
   appendOrUpdateLog("Session started.", "system-message");
 });

+// Handle incoming messages from the server
+dc.addEventListener("message", handleMessage);
+
 // implicit setLocalDescription style
 await pc.setLocalDescription();

+// Create answer
 const baseUrl = "https://api.openai.com/v1/realtime";
 const sdpResp = await fetch(`${baseUrl}?model=${model}`, {
   method: "POST",
377421
* End the current session, optionally sending instructions for a closing message.
378422
* @param {string} instructions - Instructions for the closing message.
379423
*/
380-
async function endSession(instructions = "") {
424+
async function endSession(instructions = endInstructionsEl.value || defaultEndInstructions) {
381425
console.log("Ending session...");
382426
const message = {
383427
type: "response.create",
384428
response: {
385429
modalities: ["text", "audio"],
386-
instructions: instructions || endInstructionsEl.value || defaultEndInstructions,
430+
instructions: instructions,
387431
temperature: parseFloat(temperatureEl.value),
388432
max_output_tokens: 200
389433
}
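
This refactor relies on default parameter expressions being evaluated at call time, so `endInstructionsEl.value` is read fresh on every no-argument call, just as the removed `||` chain was. The one behavioral difference: a default only applies when the argument is `undefined`, so an explicitly falsy call like `endSession("")` now sends an empty instruction string instead of falling back. A sketch of the distinction:

    function oldStyle(s = "") { return s || "fallback"; }
    function newStyle(s = "fallback") { return s; }
    oldStyle("");        // "fallback" — "" falls through the || chain
    newStyle("");        // ""        — defaults apply only to undefined
    oldStyle(undefined); // "fallback"
    newStyle(undefined); // "fallback"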
@@ -436,6 +480,9 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
 const autoSaveStatusEl = $("autoSaveStatus");
 const settingsButtonEl = $("settingsButton");
 const settingsFormEl = $("settingsForm");
+const remoteAudio = $('remoteAudio');
+const muteButton = $('muteButton');
+const volumeSlider = $('volumeSlider');
 const logEl = $("log");
 const inputContainerEl = $("inputContainer");
 const textInputEl = $("textInput");
@@ -450,6 +497,19 @@ <h1>Chat with GPT-4 Realtime with WebRTC</h1>
 const defaultVoice = "alloy";


+// Volume controls
+volumeSlider.addEventListener('input', (e) => {
+  remoteAudio.volume = e.target.value;
+});
+
+let isMuted = false;
+muteButton.addEventListener('click', () => {
+  isMuted = !isMuted;
+  remoteAudio.muted = isMuted;
+  muteButton.textContent = isMuted ? '🔇' : '🔊';
+});
+
+
 /**
  * Toggle the session control buttons based on the session state.
  * @param {boolean} isSessionActive - Whether a session is currently active.
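
`volume` and `muted` are independent `HTMLMediaElement` properties, which is why the mute toggle above never touches the slider: unmuting resumes playback at whatever level the slider last set. (The slider's `e.target.value` is a string, but assigning it to `volume` coerces it to a number in the valid 0–1 range.)

    remoteAudio.volume = 0.3;    // slider sets the level
    remoteAudio.muted = true;    // hard silence; volume untouched
    remoteAudio.muted = false;   // playback resumes at 0.3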
