diff --git a/package.json b/package.json index 061fe85..fd89321 100644 --- a/package.json +++ b/package.json @@ -43,9 +43,8 @@ }, "dependencies": { "axios": "^0.17.1", - "he": "^1.1.1", "lodash": "^4.17.4", - "striptags": "^3.1.0" + "xml2json": "^0.11.2" }, "ava": { "babel": "inherit", diff --git a/src/index.js b/src/index.js index 37a4010..70ad636 100644 --- a/src/index.js +++ b/src/index.js @@ -1,9 +1,47 @@ /* @flow */ -import he from 'he'; import axios from 'axios'; import { find } from 'lodash'; -import striptags from 'striptags'; +import parser from 'xml2json'; + +const renameTextProp = ({ $t, ...caption }) => ({ ...caption, text: $t }); + +async function alternativeCaptionsRetrieval(videoID, lang) { + const availableCaptions = await axios.get( + `http://video.google.com/timedtext?v=${videoID}&type=list` + ); + const availableCaptionsParsed = parser.toJson(availableCaptions.data, { + object: true, + }); + const availableLanguages = []; + try { + availableCaptionsParsed.transcript_list.track.forEach(track => + availableLanguages.push(track.lang_code) + ); + } catch (error) { + throw new Error(`Could not find captions for video: ${videoID}`); + } + if (!availableLanguages.includes(lang)) { + throw new Error( + `Could not find ${lang} captions. Avaliable languages: ${availableLanguages.join( + ', ' + )}.` + ); + } + const captionsForLang = await axios.get( + `http://video.google.com/timedtext?v=${videoID}&lang=${lang}` + ); + const captionsForLangParsed = parser.toJson(captionsForLang.data, { + object: true, + }); + let result = []; + try { + result = captionsForLangParsed.transcript.text.map(renameTextProp); + } catch (error) { + throw new Error(`Could not find captions for video: ${videoID}`); + } + return result; +} export async function getSubtitles({ videoID, @@ -18,9 +56,13 @@ export async function getSubtitles({ const decodedData = decodeURIComponent(data); - // * ensure we have access to captions data - if (!decodedData.includes('captionTracks')) - throw new Error(`Could not find captions for video: ${videoID}`); + if (!decodedData.includes('captionTracks')) { + const alternativeCaptions = await alternativeCaptionsRetrieval( + videoID, + lang + ); + return alternativeCaptions; + } const regex = /({"captionTracks":.*isTranslatable":(true|false)}])/; const [match] = regex.exec(decodedData); @@ -40,32 +82,13 @@ export async function getSubtitles({ throw new Error(`Could not find ${lang} captions for ${videoID}`); const { data: transcript } = await axios.get(subtitle.baseUrl); - const lines = transcript - .replace('', '') - .replace('', '') - .split('') - .filter(line => line && line.trim()) - .map(line => { - const startRegex = /start="([\d.]+)"/; - const durRegex = /dur="([\d.]+)"/; - - const [, start] = startRegex.exec(line); - const [, dur] = durRegex.exec(line); - - const htmlText = line - .replace(//, '') - .replace(/&/gi, '&') - .replace(/<\/?[^>]+(>|$)/g, ''); - - const decodedText = he.decode(htmlText); - const text = striptags(decodedText); - - return { - start, - dur, - text, - }; - }); + const transcriptParsed = parser.toJson(transcript, { object: true }); + let lines = []; + try { + lines = transcriptParsed.transcript.text.map(renameTextProp); + } catch (error) { + throw new Error(`Could not find captions for ${videoID}`); + } return lines; } diff --git a/test/index.test.js b/test/index.test.js index e9aa96e..fd41499 100644 --- a/test/index.test.js +++ b/test/index.test.js @@ -14,3 +14,13 @@ test('Extract passive income video', async t => { const subtitles = await getSubtitles({ videoID: 'JueUvj6X3DA' }); t.deepEqual('creating passive income takes work but', subtitles[0].text); }); + +test('Try capturing subtitles not listed in captionTracks', async t => { + const subtitles = await getSubtitles({ videoID: '62xdACKITrE' }); + t.deepEqual(subtitles[0], { + start: '11.8', + dur: '2.9', + text: + 'Ein Flugzeug liegt im Abendwind\nA plane is flying on the evening winds', + }); +}); diff --git a/yarn.lock b/yarn.lock index 2738e06..ac2b2e7 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1141,6 +1141,13 @@ binary-extensions@^1.0.0: buffers "~0.1.1" chainsaw "~0.1.0" +bindings@^1.5.0: + version "1.5.0" + resolved "https://registry.yarnpkg.com/bindings/-/bindings-1.5.0.tgz#10353c9e945334bc0511a6d90b38fbc7c9c504df" + integrity sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ== + dependencies: + file-uri-to-path "1.0.0" + block-stream@*: version "0.0.9" resolved "https://registry.yarnpkg.com/block-stream/-/block-stream-0.0.9.tgz#13ebfe778a03205cfe03751481ebb4b3300c126a" @@ -1933,6 +1940,11 @@ file-entry-cache@^2.0.0: flat-cache "^1.2.1" object-assign "^4.0.1" +file-uri-to-path@1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz#553a7b8446ff6f684359c445f1e37a05dacc33dd" + integrity sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw== + filename-regex@^2.0.0: version "2.0.1" resolved "https://registry.yarnpkg.com/filename-regex/-/filename-regex-2.0.1.tgz#c1c4b9bee3e09725ddb106b75c1e301fe2f18b26" @@ -2334,10 +2346,6 @@ hawk@~6.0.2: hoek "4.x.x" sntp "2.x.x" -he@^1.1.1: - version "1.1.1" - resolved "https://registry.yarnpkg.com/he/-/he-1.1.1.tgz#93410fd21b009735151f8868c2f271f3427e23fd" - hoek@2.x.x: version "2.16.3" resolved "https://registry.yarnpkg.com/hoek/-/hoek-2.16.3.tgz#20bb7403d3cea398e91dc4710a8ff1b8274a25ed" @@ -2346,6 +2354,21 @@ hoek@4.x.x: version "4.2.0" resolved "https://registry.yarnpkg.com/hoek/-/hoek-4.2.0.tgz#72d9d0754f7fe25ca2d01ad8f8f9a9449a89526d" +hoek@5.x.x: + version "5.0.4" + resolved "https://registry.yarnpkg.com/hoek/-/hoek-5.0.4.tgz#0f7fa270a1cafeb364a4b2ddfaa33f864e4157da" + integrity sha512-Alr4ZQgoMlnere5FZJsIyfIjORBqZll5POhDsF4q64dPuJR6rNxXdDxtHSQq8OXRurhmx+PWYEE8bXRROY8h0w== + +hoek@6.x.x: + version "6.1.3" + resolved "https://registry.yarnpkg.com/hoek/-/hoek-6.1.3.tgz#73b7d33952e01fe27a38b0457294b79dd8da242c" + integrity sha512-YXXAAhmF9zpQbC7LEcREFtXfGq5K1fmd+4PHkBq8NUqmzW3G+Dq10bI/i0KucLRwss3YYFQ0fSfoxBZYiGUqtQ== + +hoek@^4.2.1: + version "4.2.1" + resolved "https://registry.yarnpkg.com/hoek/-/hoek-4.2.1.tgz#9634502aa12c445dd5a7c5734b572bb8738aacbb" + integrity sha512-QLg82fGkfnJ/4iy1xZ81/9SIJiq1NGFUMGs6ParyjBZr6jW2Ufj/snDqTHixNlHdPNwN2RLVD0Pi3igeK9+JfA== + home-or-tmp@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/home-or-tmp/-/home-or-tmp-2.0.0.tgz#e36c3f2d2cae7d746a857e38d18d5f32a7882db8" @@ -2668,6 +2691,13 @@ isarray@1.0.0, isarray@^1.0.0, isarray@~1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" +isemail@3.x.x: + version "3.2.0" + resolved "https://registry.yarnpkg.com/isemail/-/isemail-3.2.0.tgz#59310a021931a9fb06bbb51e155ce0b3f236832c" + integrity sha512-zKqkK+O+dGqevc93KNsbZ/TqTUFd46MwWjYOoMrjIMZ51eU7DtQG3Wmd9SQQT7i7RVnuTPEiYEWHU3MSbxC1Tg== + dependencies: + punycode "2.x.x" + isexe@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10" @@ -2693,6 +2723,15 @@ jest-docblock@^21.0.0: version "21.2.0" resolved "https://registry.yarnpkg.com/jest-docblock/-/jest-docblock-21.2.0.tgz#51529c3b30d5fd159da60c27ceedc195faf8d414" +joi@^13.1.2: + version "13.7.0" + resolved "https://registry.yarnpkg.com/joi/-/joi-13.7.0.tgz#cfd85ebfe67e8a1900432400b4d03bbd93fb879f" + integrity sha512-xuY5VkHfeOYK3Hdi91ulocfuFopwgbSORmIwzcwHKESQhC7w1kD5jaVSPnqDxS2I8t3RZ9omCKAxNwXN5zG1/Q== + dependencies: + hoek "5.x.x" + isemail "3.x.x" + topo "3.x.x" + js-string-escape@^1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/js-string-escape/-/js-string-escape-1.0.1.tgz#e2625badbc0d67c7533e9edc1068c587ae4137ef" @@ -3059,6 +3098,11 @@ mute-stream@0.0.7: version "0.0.7" resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-0.0.7.tgz#3075ce93bc21b8fab43e1bc4da7e8115ed1e7bab" +nan@^2.13.2: + version "2.14.0" + resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.0.tgz#7818f722027b2459a86f0295d434d1fc2336c52c" + integrity sha512-INOFj37C7k3AfaNTtX8RhsTw7qRy7eLET14cROi9+5HAVbbHuIWUHEauBv5qT4Av2tWasiTY1Jw6puUNqRJXQg== + nan@^2.3.0: version "2.8.0" resolved "https://registry.yarnpkg.com/nan/-/nan-2.8.0.tgz#ed715f3fe9de02b57a5e6252d90a96675e1f085a" @@ -3071,6 +3115,14 @@ natural-compare@^1.4.0: version "1.4.0" resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7" +node-expat@^2.3.15: + version "2.3.18" + resolved "https://registry.yarnpkg.com/node-expat/-/node-expat-2.3.18.tgz#d9e6949cecda15e131f14259b73dc7b9ed7bc560" + integrity sha512-9dIrDxXePa9HSn+hhlAg1wXkvqOjxefEbMclGxk2cEnq/Y3U7Qo5HNNqeo3fQ4bVmLhcdt3YN1TZy7WMZy4MHw== + dependencies: + bindings "^1.5.0" + nan "^2.13.2" + node-pre-gyp@^0.6.39: version "0.6.39" resolved "https://registry.yarnpkg.com/node-pre-gyp/-/node-pre-gyp-0.6.39.tgz#c00e96860b23c0e1420ac7befc5044e1d78d8649" @@ -3451,6 +3503,11 @@ pseudomap@^1.0.2: setimmediate ">= 1.0.2 < 2" slice-stream ">= 1.0.0 < 2" +punycode@2.x.x: + version "2.1.1" + resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec" + integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A== + punycode@^1.4.1: version "1.4.1" resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.4.1.tgz#c0d5a63b2718800ad8e1eb0fa5269c84dd41845e" @@ -3994,10 +4051,6 @@ strip-json-comments@~2.0.1: version "2.0.1" resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-2.0.1.tgz#3c531942e908c2697c0ec344858c286c7ca0a60a" -striptags@^3.1.0: - version "3.1.0" - resolved "https://registry.yarnpkg.com/striptags/-/striptags-3.1.0.tgz#763e534338d9cf542f004a4b1eb099e32d295e44" - supertap@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/supertap/-/supertap-1.0.0.tgz#bd9751c7fafd68c68cf8222a29892206a119fa9e" @@ -4107,6 +4160,13 @@ to-fast-properties@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/to-fast-properties/-/to-fast-properties-2.0.0.tgz#dc5e698cbd079265bc73e0377681a4e4e83f616e" +topo@3.x.x: + version "3.0.3" + resolved "https://registry.yarnpkg.com/topo/-/topo-3.0.3.tgz#d5a67fb2e69307ebeeb08402ec2a2a6f5f7ad95c" + integrity sha512-IgpPtvD4kjrJ7CRA3ov2FhWQADwv+Tdqbsf1ZnPUSAtCJ9e1Z44MmoSGDXGk4IppoZA7jd/QRkNddlLJWlUZsQ== + dependencies: + hoek "6.x.x" + tough-cookie@~2.3.0, tough-cookie@~2.3.3: version "2.3.3" resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-2.3.3.tgz#0b618a5565b6dea90bf3425d04d55edc475a7561" @@ -4341,6 +4401,15 @@ xdg-basedir@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-3.0.0.tgz#496b2cc109eca8dbacfe2dc72b603c17c5870ad4" +xml2json@^0.11.2: + version "0.11.2" + resolved "https://registry.yarnpkg.com/xml2json/-/xml2json-0.11.2.tgz#70ddd234fd7818312cc58455cab8457b5bcc7c52" + integrity sha512-ZJpHpPOL0T5lOvAHMnWm59iQOPqNtam5t2TMUllWZ1k5Wm8L5YyvQnkeaVnRKCvDwY5EumqXWyOjjMdQVz272A== + dependencies: + hoek "^4.2.1" + joi "^13.1.2" + node-expat "^2.3.15" + xtend@^4.0.0, xtend@~4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/xtend/-/xtend-4.0.1.tgz#a5c6d532be656e23db820efb943a1f04998d63af"