diff --git a/plot/crawl_size.py b/plot/crawl_size.py
index 83eb1f5..7cab6b1 100644
--- a/plot/crawl_size.py
+++ b/plot/crawl_size.py
@@ -171,18 +171,11 @@ def plot(self):
                        'Pages / Unique Items Cumulative',
                        'crawlsize/cumulative.png',
                        data_export_csv='crawlsize/cumulative.csv')
-        # -- new items per crawl
-        row_types = ['page', 'url estim. new',
-                     'digest estim. new']
-        self.size_plot(self.size_by_type, row_types, ' new$',
-                       'New Items per Crawl (not observed in prior crawls)',
-                       'Pages / New Items', 'crawlsize/monthly_new.png',
-                       data_export_csv='crawlsize/monthly_new.csv')
         # -- new URLs per crawl
         row_types = ['url estim. new']
-        self.size_plot(self.size_by_type, row_types, ' new$',
+        self.size_plot(self.size_by_type, row_types, '',
                        'New URLs per Crawl (not observed in prior crawls)',
-                       '', 'crawlsize/monthly_new_urls.png',
+                       'New URLs', 'crawlsize/monthly_new.png',
                        data_export_csv='crawlsize/monthly_new.csv')
         # -- cumulative URLs over last N crawls (this and preceding N-1 crawls)
         row_types = ['url', '1 crawl',  # 'url' replaced by '1 crawl'
diff --git a/plots/crawlsize.md b/plots/crawlsize.md
index ecb4835..3b2c8c5 100644
--- a/plots/crawlsize.md
+++ b/plots/crawlsize.md
@@ -27,11 +27,11 @@ Every monthly crawl is a sample of the web and we try to make every monthly snap
 
 ![Cumulative size of monthly crawl archives since 2013](./crawlsize/cumulative.png)
 
-The next plot shows the difference in the cumulative size to the preceding crawl. In other words, the amount of new URLs or new content not observed in any of the preceding monthly crawls.
+The next plot shows the difference in the cumulative number of URLs relative to the preceding crawl, in other words, the number of new URLs not observed in any of the preceding crawls.
 
-![New Items per Crawl, not observed in prior crawls](./crawlsize/monthly_new.png)
+![New URLs per Crawl, not observed in prior crawls](./crawlsize/monthly_new.png)
 
-([New items per crawl as CSV](./crawlsize/monthly_new.csv))
+([New URLs per crawl as CSV](./crawlsize/monthly_new.csv))
 
 How many unique items (in terms of URLs or unique content by digest) are covered by the last n crawls? The coverage over certain time intervals went down early 2015 when continuous donations of verified seeds stopped. Since autumn 2016 we are able to extend the crawl by our own, and we try to increase the coverage for the last n crawls.
 
diff --git a/plots/crawlsize/monthly_new.csv b/plots/crawlsize/monthly_new.csv
index 84232b2..083db43 100644
--- a/plots/crawlsize/monthly_new.csv
+++ b/plots/crawlsize/monthly_new.csv
@@ -1,4 +1,4 @@
-crawl,url estim.
+crawl,url estim. new
 CC-MAIN-2008-2009,1799114116
 CC-MAIN-2009-2010,2025520640
 CC-MAIN-2012,2875802047
diff --git a/plots/crawlsize/monthly_new.png b/plots/crawlsize/monthly_new.png
index c51330f..d1211ea 100644
Binary files a/plots/crawlsize/monthly_new.png and b/plots/crawlsize/monthly_new.png differ
diff --git a/plots/crawlsize/monthly_new_urls.png b/plots/crawlsize/monthly_new_urls.png
deleted file mode 100644
index c3be3e1..0000000
Binary files a/plots/crawlsize/monthly_new_urls.png and /dev/null differ
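
For context on the `url estim. new` column kept in `monthly_new.csv`: the prose in `crawlsize.md` describes it as the per-crawl increment of the cumulative unique-URL estimate. Below is a minimal, hypothetical pandas sketch of that relationship; it is not taken from `plot/crawl_size.py` (which builds the series from its own aggregated counts), and the cumulative figures are simply back-derived from the first three rows of the CSV above.

```python
# Hypothetical sketch: derive "new URLs per crawl" as the first difference
# of cumulative unique-URL estimates. Standalone example, not repo code.
import pandas as pd

# Assumed input: cumulative estimated unique URLs after each crawl
# (values back-computed from the monthly_new.csv rows shown in the diff).
cumulative = pd.DataFrame({
    'crawl': ['CC-MAIN-2008-2009', 'CC-MAIN-2009-2010', 'CC-MAIN-2012'],
    'url estim. cumul.': [1799114116, 3824634756, 6700436803],
})

# New URLs per crawl = cumulative count minus the preceding cumulative count;
# the first crawl contributes its full count, so fill the NaN from diff().
new_urls = cumulative['url estim. cumul.'].diff()
new_urls.iloc[0] = cumulative['url estim. cumul.'].iloc[0]

export = pd.DataFrame({
    'crawl': cumulative['crawl'],
    'url estim. new': new_urls.astype(int),  # header matches the patched CSV
})
export.to_csv('crawlsize/monthly_new.csv', index=False)
```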