diff --git a/bot/job_options_parser.rb b/bot/job_options_parser.rb index b7c96954..76b8fe61 100644 --- a/bot/job_options_parser.rb +++ b/bot/job_options_parser.rb @@ -5,6 +5,7 @@ class JobOptionsParser def initialize @parser = Trollop::Parser.new do opt :no_offsite_links, 'Do not fetch offsite links' + opt :no_cookies, 'Do not use cookies' opt :youtube_dl, 'Use youtube-dl on grabbed pages' opt :ignore_sets, 'Ignore sets to apply', :type => :string opt :pipeline, 'Run job on this pipeline', :type => :string @@ -23,6 +24,7 @@ def parse(str) b[0] = (case b[0] when '--ignoresets','--ignore_sets','--ignoreset','--ignore-set','--ignore_set','--ig-set','--igset' then '--ignore-sets' when '--nooffsitelinks','--no-offsite','--nooffsite' then '--no-offsite-links' + when '--nocookies' then '--no-cookies' when '--useragentalias','--user-agent','--useragent' then '--user-agent-alias' when '--concurrent' then '--concurrency' when '--reason' then '--explain' diff --git a/bot/pipeline_options.rb b/bot/pipeline_options.rb index 03b2db54..e1b6be6c 100644 --- a/bot/pipeline_options.rb +++ b/bot/pipeline_options.rb @@ -17,6 +17,11 @@ def run_post_registration_hooks(m, job, params) messages << 'offsite links: no' end + if params[:no_cookies] + job.no_cookies! + messages << 'use cookies: no' + end + if !messages.empty? reply m, "Options: #{messages.join('; ')}" end diff --git a/doc/commands.rst b/doc/commands.rst index ba192dd9..b1765811 100644 --- a/doc/commands.rst +++ b/doc/commands.rst @@ -67,6 +67,9 @@ Accepted parameters Aliases: ``--nooffsitelinks``, ``--no-offsite``, ``--nooffsite`` +``--no-cookies`` + do not use cookies for each request + ``--user-agent-alias ALIAS`` specify a user-agent to use:: diff --git a/lib/job.rb b/lib/job.rb index 48ecc4db..cb6ffcc3 100644 --- a/lib/job.rb +++ b/lib/job.rb @@ -380,6 +380,10 @@ def no_offsite_links! redis.hset(ident, 'no_offsite_links', true) end + def no_cookies! + redis.hset(ident, 'no_cookies', true) + end + def yahoo silently do set_delay(0, 0) diff --git a/pipeline/archivebot/seesaw/tasks.py b/pipeline/archivebot/seesaw/tasks.py index 022fa5df..9b31932e 100644 --- a/pipeline/archivebot/seesaw/tasks.py +++ b/pipeline/archivebot/seesaw/tasks.py @@ -140,6 +140,7 @@ def process(self, item): item['url_file'] = job_data.get('url_file') item['user_agent'] = job_data.get('user_agent') item['no_offsite_links'] = job_data.get('no_offsite_links') + item['no_cookies'] = job_data.get('no_cookies') item['youtube_dl'] = job_data.get('youtube_dl') item.log_output('Received item %s.' % ident) diff --git a/pipeline/archivebot/seesaw/wpull.py b/pipeline/archivebot/seesaw/wpull.py index 48fa0659..c2a8d7a6 100644 --- a/pipeline/archivebot/seesaw/wpull.py +++ b/pipeline/archivebot/seesaw/wpull.py @@ -22,7 +22,6 @@ def make_args(item, default_user_agent, wpull_exe, youtube_dl_exe, finished_warc '-o', '%(item_dir)s/wpull.log' % item, '--database', '%(item_dir)s/wpull.db' % item, '--html-parser', 'libxml2-lxml', - '--save-cookies', '%(cookie_jar)s' % item, '--no-check-certificate', '--no-strong-crypto', '--delete-after', @@ -51,6 +50,11 @@ def make_args(item, default_user_agent, wpull_exe, youtube_dl_exe, finished_warc '--youtube-dl-exe', youtube_dl_exe ] + if item.get('no_cookies'): + args.append('--no-cookies') + else: + add_args(args, ['--save-cookies', '%(cookie_jar)s'], item) + if item['url'].startswith("http://www.reddit.com/") or \ item['url'].startswith("https://www.reddit.com/"): add_args(args, ['--header', 'Cookie: over18=1'], item) diff --git a/spec/bot/job_options_parser_spec.rb b/spec/bot/job_options_parser_spec.rb index ed03b877..9039af68 100644 --- a/spec/bot/job_options_parser_spec.rb +++ b/spec/bot/job_options_parser_spec.rb @@ -53,6 +53,10 @@ expect(parser.parse('--concurrency=4')[:concurrency]).to eq(4) end + it 'recognizes --no-cookies' do + expect(parser.parse('--no-cookies')[:no_cookies]).to eq(true) + end + describe 'when unknown options are present' do it 'raises UnknownOptionError' do expect(lambda { parser.parse('--foo=bar') }).to raise_error(JobOptionsParser::UnknownOptionError)