diff --git a/package.json b/package.json
index 061fe85..fd89321 100644
--- a/package.json
+++ b/package.json
@@ -43,9 +43,8 @@
},
"dependencies": {
"axios": "^0.17.1",
- "he": "^1.1.1",
"lodash": "^4.17.4",
- "striptags": "^3.1.0"
+ "xml2json": "^0.11.2"
},
"ava": {
"babel": "inherit",
diff --git a/src/index.js b/src/index.js
index 37a4010..70ad636 100644
--- a/src/index.js
+++ b/src/index.js
@@ -1,9 +1,47 @@
/* @flow */
-import he from 'he';
import axios from 'axios';
import { find } from 'lodash';
-import striptags from 'striptags';
+import parser from 'xml2json';
+
+const renameTextProp = ({ $t: text, ...rest }) => ({ ...rest, text });
+
+/**
+ * Fallback caption lookup through the timedtext endpoint, used by
+ * getSubtitles when the fetched video data has no `captionTracks`.
+ *
+ * @param videoID id of the video, sent as the `v` query parameter
+ * @param lang language code of the caption track to download
+ * @returns caption lines: each line's XML attributes plus its content as `text`
+ * @throws {Error} when the video has no captions, or none for `lang`
+ */
+async function alternativeCaptionsRetrieval(videoID, lang) {
+  const endpoint = 'https://video.google.com/timedtext';
+  // xml2json returns a bare object, not an array, for a lone child element.
+  const toArray = value => [].concat(value || []);
+  const fetchParsed = async params => {
+    const { data } = await axios.get(endpoint, { params });
+    return parser.toJson(data, { object: true }) || {};
+  };
+  const list = await fetchParsed({ v: videoID, type: 'list' });
+  const tracks = toArray((list.transcript_list || {}).track);
+  const availableLanguages = tracks.map(track => track.lang_code);
+  if (availableLanguages.length === 0) {
+    throw new Error(`Could not find captions for video: ${videoID}`);
+  }
+  if (!availableLanguages.includes(lang)) {
+    const known = availableLanguages.join(', ');
+    throw new Error(
+      `Could not find ${lang} captions. Available languages: ${known}.`
+    );
+  }
+  const captions = await fetchParsed({ v: videoID, lang });
+  const lines = toArray((captions.transcript || {}).text);
+  if (lines.length === 0) {
+    throw new Error(`Could not find captions for video: ${videoID}`);
+  }
+  return lines.map(renameTextProp);
+}
export async function getSubtitles({
videoID,
@@ -18,9 +56,13 @@ export async function getSubtitles({
const decodedData = decodeURIComponent(data);
- // * ensure we have access to captions data
- if (!decodedData.includes('captionTracks'))
- throw new Error(`Could not find captions for video: ${videoID}`);
+ if (!decodedData.includes('captionTracks')) {
+ const alternativeCaptions = await alternativeCaptionsRetrieval(
+ videoID,
+ lang
+ );
+ return alternativeCaptions;
+ }
const regex = /({"captionTracks":.*isTranslatable":(true|false)}])/;
const [match] = regex.exec(decodedData);
@@ -40,32 +82,13 @@ export async function getSubtitles({
throw new Error(`Could not find ${lang} captions for ${videoID}`);
const { data: transcript } = await axios.get(subtitle.baseUrl);
- const lines = transcript
- .replace('', '')
- .replace('', '')
- .split('')
- .filter(line => line && line.trim())
- .map(line => {
- const startRegex = /start="([\d.]+)"/;
- const durRegex = /dur="([\d.]+)"/;
-
- const [, start] = startRegex.exec(line);
- const [, dur] = durRegex.exec(line);
-
- const htmlText = line
- .replace(//, '')
- .replace(/&/gi, '&')
- .replace(/<\/?[^>]+(>|$)/g, '');
-
- const decodedText = he.decode(htmlText);
- const text = striptags(decodedText);
-
- return {
- start,
- dur,
- text,
- };
- });
+ const transcriptParsed = parser.toJson(transcript, { object: true });
+ let lines = [];
+ try {
+ lines = transcriptParsed.transcript.text.map(renameTextProp);
+ } catch (error) {
+ throw new Error(`Could not find captions for ${videoID}`);
+ }
return lines;
}
diff --git a/test/index.test.js b/test/index.test.js
index e9aa96e..fd41499 100644
--- a/test/index.test.js
+++ b/test/index.test.js
@@ -14,3 +14,13 @@ test('Extract passive income video', async t => {
const subtitles = await getSubtitles({ videoID: 'JueUvj6X3DA' });
t.deepEqual('creating passive income takes work but', subtitles[0].text);
});
+
+test('Try capturing subtitles not listed in captionTracks', async t => {
+  // Network test: fetches real captions and checks only the first
+  // caption line.
+  const [firstLine] = await getSubtitles({ videoID: '62xdACKITrE' });
+  const text =
+    'Ein Flugzeug liegt im Abendwind\nA plane is flying on the evening winds';
+  const expected = { start: '11.8', dur: '2.9', text };
+  t.deepEqual(firstLine, expected);
+});
diff --git a/yarn.lock b/yarn.lock
index 2738e06..ac2b2e7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1141,6 +1141,13 @@ binary-extensions@^1.0.0:
buffers "~0.1.1"
chainsaw "~0.1.0"
+bindings@^1.5.0:
+ version "1.5.0"
+ resolved "https://registry.yarnpkg.com/bindings/-/bindings-1.5.0.tgz#10353c9e945334bc0511a6d90b38fbc7c9c504df"
+ integrity sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==
+ dependencies:
+ file-uri-to-path "1.0.0"
+
block-stream@*:
version "0.0.9"
resolved "https://registry.yarnpkg.com/block-stream/-/block-stream-0.0.9.tgz#13ebfe778a03205cfe03751481ebb4b3300c126a"
@@ -1933,6 +1940,11 @@ file-entry-cache@^2.0.0:
flat-cache "^1.2.1"
object-assign "^4.0.1"
+file-uri-to-path@1.0.0:
+ version "1.0.0"
+ resolved "https://registry.yarnpkg.com/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz#553a7b8446ff6f684359c445f1e37a05dacc33dd"
+ integrity sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==
+
filename-regex@^2.0.0:
version "2.0.1"
resolved "https://registry.yarnpkg.com/filename-regex/-/filename-regex-2.0.1.tgz#c1c4b9bee3e09725ddb106b75c1e301fe2f18b26"
@@ -2334,10 +2346,6 @@ hawk@~6.0.2:
hoek "4.x.x"
sntp "2.x.x"
-he@^1.1.1:
- version "1.1.1"
- resolved "https://registry.yarnpkg.com/he/-/he-1.1.1.tgz#93410fd21b009735151f8868c2f271f3427e23fd"
-
hoek@2.x.x:
version "2.16.3"
resolved "https://registry.yarnpkg.com/hoek/-/hoek-2.16.3.tgz#20bb7403d3cea398e91dc4710a8ff1b8274a25ed"
@@ -2346,6 +2354,21 @@ hoek@4.x.x:
version "4.2.0"
resolved "https://registry.yarnpkg.com/hoek/-/hoek-4.2.0.tgz#72d9d0754f7fe25ca2d01ad8f8f9a9449a89526d"
+hoek@5.x.x:
+ version "5.0.4"
+ resolved "https://registry.yarnpkg.com/hoek/-/hoek-5.0.4.tgz#0f7fa270a1cafeb364a4b2ddfaa33f864e4157da"
+ integrity sha512-Alr4ZQgoMlnere5FZJsIyfIjORBqZll5POhDsF4q64dPuJR6rNxXdDxtHSQq8OXRurhmx+PWYEE8bXRROY8h0w==
+
+hoek@6.x.x:
+ version "6.1.3"
+ resolved "https://registry.yarnpkg.com/hoek/-/hoek-6.1.3.tgz#73b7d33952e01fe27a38b0457294b79dd8da242c"
+ integrity sha512-YXXAAhmF9zpQbC7LEcREFtXfGq5K1fmd+4PHkBq8NUqmzW3G+Dq10bI/i0KucLRwss3YYFQ0fSfoxBZYiGUqtQ==
+
+hoek@^4.2.1:
+ version "4.2.1"
+ resolved "https://registry.yarnpkg.com/hoek/-/hoek-4.2.1.tgz#9634502aa12c445dd5a7c5734b572bb8738aacbb"
+ integrity sha512-QLg82fGkfnJ/4iy1xZ81/9SIJiq1NGFUMGs6ParyjBZr6jW2Ufj/snDqTHixNlHdPNwN2RLVD0Pi3igeK9+JfA==
+
home-or-tmp@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/home-or-tmp/-/home-or-tmp-2.0.0.tgz#e36c3f2d2cae7d746a857e38d18d5f32a7882db8"
@@ -2668,6 +2691,13 @@ isarray@1.0.0, isarray@^1.0.0, isarray@~1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11"
+isemail@3.x.x:
+ version "3.2.0"
+ resolved "https://registry.yarnpkg.com/isemail/-/isemail-3.2.0.tgz#59310a021931a9fb06bbb51e155ce0b3f236832c"
+ integrity sha512-zKqkK+O+dGqevc93KNsbZ/TqTUFd46MwWjYOoMrjIMZ51eU7DtQG3Wmd9SQQT7i7RVnuTPEiYEWHU3MSbxC1Tg==
+ dependencies:
+ punycode "2.x.x"
+
isexe@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"
@@ -2693,6 +2723,15 @@ jest-docblock@^21.0.0:
version "21.2.0"
resolved "https://registry.yarnpkg.com/jest-docblock/-/jest-docblock-21.2.0.tgz#51529c3b30d5fd159da60c27ceedc195faf8d414"
+joi@^13.1.2:
+ version "13.7.0"
+ resolved "https://registry.yarnpkg.com/joi/-/joi-13.7.0.tgz#cfd85ebfe67e8a1900432400b4d03bbd93fb879f"
+ integrity sha512-xuY5VkHfeOYK3Hdi91ulocfuFopwgbSORmIwzcwHKESQhC7w1kD5jaVSPnqDxS2I8t3RZ9omCKAxNwXN5zG1/Q==
+ dependencies:
+ hoek "5.x.x"
+ isemail "3.x.x"
+ topo "3.x.x"
+
js-string-escape@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/js-string-escape/-/js-string-escape-1.0.1.tgz#e2625badbc0d67c7533e9edc1068c587ae4137ef"
@@ -3059,6 +3098,11 @@ mute-stream@0.0.7:
version "0.0.7"
resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-0.0.7.tgz#3075ce93bc21b8fab43e1bc4da7e8115ed1e7bab"
+nan@^2.13.2:
+ version "2.14.0"
+ resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.0.tgz#7818f722027b2459a86f0295d434d1fc2336c52c"
+ integrity sha512-INOFj37C7k3AfaNTtX8RhsTw7qRy7eLET14cROi9+5HAVbbHuIWUHEauBv5qT4Av2tWasiTY1Jw6puUNqRJXQg==
+
nan@^2.3.0:
version "2.8.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.8.0.tgz#ed715f3fe9de02b57a5e6252d90a96675e1f085a"
@@ -3071,6 +3115,14 @@ natural-compare@^1.4.0:
version "1.4.0"
resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7"
+node-expat@^2.3.15:
+ version "2.3.18"
+ resolved "https://registry.yarnpkg.com/node-expat/-/node-expat-2.3.18.tgz#d9e6949cecda15e131f14259b73dc7b9ed7bc560"
+ integrity sha512-9dIrDxXePa9HSn+hhlAg1wXkvqOjxefEbMclGxk2cEnq/Y3U7Qo5HNNqeo3fQ4bVmLhcdt3YN1TZy7WMZy4MHw==
+ dependencies:
+ bindings "^1.5.0"
+ nan "^2.13.2"
+
node-pre-gyp@^0.6.39:
version "0.6.39"
resolved "https://registry.yarnpkg.com/node-pre-gyp/-/node-pre-gyp-0.6.39.tgz#c00e96860b23c0e1420ac7befc5044e1d78d8649"
@@ -3451,6 +3503,11 @@ pseudomap@^1.0.2:
setimmediate ">= 1.0.2 < 2"
slice-stream ">= 1.0.0 < 2"
+punycode@2.x.x:
+ version "2.1.1"
+ resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec"
+ integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==
+
punycode@^1.4.1:
version "1.4.1"
resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.4.1.tgz#c0d5a63b2718800ad8e1eb0fa5269c84dd41845e"
@@ -3994,10 +4051,6 @@ strip-json-comments@~2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-2.0.1.tgz#3c531942e908c2697c0ec344858c286c7ca0a60a"
-striptags@^3.1.0:
- version "3.1.0"
- resolved "https://registry.yarnpkg.com/striptags/-/striptags-3.1.0.tgz#763e534338d9cf542f004a4b1eb099e32d295e44"
-
supertap@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/supertap/-/supertap-1.0.0.tgz#bd9751c7fafd68c68cf8222a29892206a119fa9e"
@@ -4107,6 +4160,13 @@ to-fast-properties@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/to-fast-properties/-/to-fast-properties-2.0.0.tgz#dc5e698cbd079265bc73e0377681a4e4e83f616e"
+topo@3.x.x:
+ version "3.0.3"
+ resolved "https://registry.yarnpkg.com/topo/-/topo-3.0.3.tgz#d5a67fb2e69307ebeeb08402ec2a2a6f5f7ad95c"
+ integrity sha512-IgpPtvD4kjrJ7CRA3ov2FhWQADwv+Tdqbsf1ZnPUSAtCJ9e1Z44MmoSGDXGk4IppoZA7jd/QRkNddlLJWlUZsQ==
+ dependencies:
+ hoek "6.x.x"
+
tough-cookie@~2.3.0, tough-cookie@~2.3.3:
version "2.3.3"
resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-2.3.3.tgz#0b618a5565b6dea90bf3425d04d55edc475a7561"
@@ -4341,6 +4401,15 @@ xdg-basedir@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-3.0.0.tgz#496b2cc109eca8dbacfe2dc72b603c17c5870ad4"
+xml2json@^0.11.2:
+ version "0.11.2"
+ resolved "https://registry.yarnpkg.com/xml2json/-/xml2json-0.11.2.tgz#70ddd234fd7818312cc58455cab8457b5bcc7c52"
+ integrity sha512-ZJpHpPOL0T5lOvAHMnWm59iQOPqNtam5t2TMUllWZ1k5Wm8L5YyvQnkeaVnRKCvDwY5EumqXWyOjjMdQVz272A==
+ dependencies:
+ hoek "^4.2.1"
+ joi "^13.1.2"
+ node-expat "^2.3.15"
+
xtend@^4.0.0, xtend@~4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/xtend/-/xtend-4.0.1.tgz#a5c6d532be656e23db820efb943a1f04998d63af"