
Commit 54b4505

Merge pull request #19 from civitaspo/atomic
Add mode option with replace mode
2 parents 213f330 + cf1d1a8 commit 54b4505

21 files changed: +1187 −208 lines

.travis.yml

Lines changed: 6 additions & 1 deletion

```diff
@@ -1,3 +1,4 @@
+dist: precise
 language: java
 jdk:
   - openjdk7
@@ -6,4 +7,8 @@ jdk:
 script:
   - ./gradlew test
 after_success:
-  - ./gradlew jacocoTestReport coveralls
+  - ./gradlew jacocoTestReport coveralls
+addons:
+  hosts:
+    - example.com
+  hostname: example.com
```

README.md

Lines changed: 38 additions & 9 deletions

```diff
@@ -14,26 +14,55 @@ A File Output Plugin for Embulk to write HDFS.
 
 ## Configuration
 
-- **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
-- **config** overwrites configuration parameters (hash, default: `{}`)
-- **path_prefix** prefix of target files (string, required)
-- **file_ext** suffix of target files (string, required)
-- **sequence_format** format for sequence part of target files (string, default: `'%03d.%02d.'`)
-- **rewind_seconds** When you use Date format in path_prefix property(like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted by using the time which is Now minus this property. (int, default: `0`)
-- **overwrite** overwrite files when the same filenames already exists (boolean, default: `false`)
+- **config_files**: list of paths to Hadoop's configuration files (array of strings, default: `[]`)
+- **config**: overwrites configuration parameters (hash, default: `{}`)
+- **path_prefix**: prefix of target files (string, required)
+- **file_ext**: suffix of target files (string, required)
+- **sequence_format**: format for the sequence part of target files (string, default: `'%03d.%02d.'`)
+- **rewind_seconds**: when you use a date format in the path_prefix property (like `/tmp/embulk/%Y-%m-%d/out`), the format is interpreted using the current time minus this value (int, default: `0`)
+- **doas**: username used to access HDFS (string, default: the executing user)
+- **overwrite** *(deprecated: please use the `mode` option instead)*: overwrite files when the same filenames already exist (boolean, default: `false`)
   - *caution*: even if this property is `true`, this does not mean ensuring the idempotence. if you want to ensure the idempotence, you need the procedures to remove output files after or before running.
-- **doas** username which access to Hdfs (string, default: executed user)
-- **delete_in_advance** delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
+- **delete_in_advance** *(deprecated: please use the `mode` option instead)*: delete files and directories having `path_prefix` in advance (enum, default: `NONE`)
   - `NONE`: do nothing
   - `FILE_ONLY`: delete files
   - `RECURSIVE`: delete files and directories
+- **mode**: `"abort_if_exist"`, `"overwrite"`, `"delete_files_in_advance"`, `"delete_recursive_in_advance"`, or `"replace"`. See below. (string, optional, default: `"abort_if_exist"`)
+  * In the future, the default mode will become `"replace"`.
 
 ## CAUTION
 If you use `hadoop` user (hdfs admin user) as `doas`, and if `delete_in_advance` is `RECURSIVE`,
 `embulk-output-hdfs` can delete any files and directories you indicate as `path_prefix`,
 this means `embulk-output-hdfs` can destroy your hdfs.
 So, please be careful when you use `delete_in_advance` option and `doas` option ...
+## About DELETE
+
+When this plugin deletes files or directories, it uses the [Hadoop Trash API](https://hadoop.apache.org/docs/r2.8.0/api/org/apache/hadoop/fs/Trash.html), so you can find them in the trash during `fs.trash.interval`.
+
+## Modes
+
+* **abort_if_exist**:
+  * Behavior: this mode writes rows to the target files in order. If the target files already exist, the transaction is aborted.
+  * Transactional: no. If it fails, the target files may have some rows written.
+  * Resumable: no.
+* **overwrite**:
+  * Behavior: this mode writes rows to the target files in order. If the target files already exist, they are rewritten from the beginning.
+  * Transactional: no. If it fails, the target files may have some rows written.
+  * Resumable: no.
+* **delete_files_in_advance**:
+  * Behavior: this mode deletes matching files first, then writes rows to the target files in order.
+  * Transactional: no. If it fails, the target files may have been removed.
+  * Resumable: no.
+* **delete_recursive_in_advance**:
+  * Behavior: this mode deletes matching directories recursively first, then writes rows to the target files in order.
+  * Transactional: no. If it fails, the target files may have been removed.
+  * Resumable: no.
+* **replace**:
+  * Behavior: this mode writes rows to workspace files in order, then moves them over the target directories. This **replace** is not **atomic**, because the HDFS API does not provide an atomic replace.
+  * Transactional: no. If it fails, the target files may have been removed.
+  * Resumable: no.
+
 
 ## Example
 ```yaml
```
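Taken together with the Modes list above, a minimal `out` section using the new option might look like this (a sketch; the `path_prefix` and `file_ext` values are illustrative, not part of this commit):

```yaml
out:
  type: hdfs
  config_files:
    - /etc/hadoop/conf/core-site.xml
    - /etc/hadoop/conf/hdfs-site.xml
  path_prefix: /tmp/embulk-output-hdfs_example/file_
  file_ext: csv
  mode: replace   # workspace files are written first, then moved over the target
  formatter:
    type: csv
```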

example/config.yml

Lines changed: 3 additions & 1 deletion

```diff
@@ -6,12 +6,14 @@ hdfs_example: &hdfs_example
     fs.defaultFS: 'hdfs://hadoop-nn1:8020'
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+    fs.trash.interval: 3600
 
 local_fs_example: &local_fs_example
   config:
     fs.defaultFS: 'file:///'
     fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
+    fs.trash.interval: 3600
     io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
 
 in:
@@ -38,7 +40,7 @@ out:
   <<: *local_fs_example
   path_prefix: /tmp/embulk-output-hdfs_example/file_
   file_ext: csv
-  delete_in_advance: FILE_ONLY
+  mode: replace
   formatter:
     type: csv
     newline: CRLF
```
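The `fs.trash.interval` settings above tie into the README's "About DELETE" note: deletions go through Hadoop's trash rather than being permanent. A minimal standalone sketch of that Hadoop API (not code from this commit; the target path is illustrative):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;

public class TrashSketch
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.trash.interval", "3600"); // minutes a trashed file is kept

        FileSystem fs = FileSystem.get(conf);
        Path target = new Path("/tmp/embulk-output-hdfs_example/file_000.00.csv");

        // Moves the path into the user's .Trash directory instead of deleting
        // it permanently; returns false when trash is disabled (interval 0).
        boolean trashed = Trash.moveToAppropriateTrash(fs, target, conf);
        System.out.println("moved to trash: " + trashed);
    }
}
```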
Lines changed: 52 additions & 0 deletions (new file)

```yaml
hdfs_example: &hdfs_example
  config_files:
    - /etc/hadoop/conf/core-site.xml
    - /etc/hadoop/conf/hdfs-site.xml
  config:
    fs.defaultFS: 'hdfs://hadoop-nn1:8020'
    fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
    fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'

local_fs_example: &local_fs_example
  config:
    fs.defaultFS: 'file:///'
    fs.hdfs.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
    fs.file.impl: 'org.apache.hadoop.fs.RawLocalFileSystem'
    io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'

in:
  type: file
  path_prefix: example/data
  parser:
    charset: UTF-8
    newline: CRLF
    type: csv
    delimiter: ','
    quote: '"'
    header_line: true
    stop_on_invalid_record: true
    columns:
      - {name: id, type: long}
      - {name: account, type: long}
      - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
      - {name: purchase, type: timestamp, format: '%Y%m%d'}
      - {name: comment, type: string}


out:
  type: hdfs
  <<: *local_fs_example
  path_prefix: /tmp/embulk-output-hdfs_example/file_
  file_ext: csv
  delete_in_advance: FILE_ONLY
  formatter:
    type: csv
    newline: CRLF
    newline_in_field: LF
    header_line: true
    charset: UTF-8
    quote_policy: NONE
    quote: '"'
    escape: '\'
    null_string: ''
    default_timezone: UTC
```
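This example keeps the deprecated `delete_in_advance` option; going by the README's Modes list, the equivalent `out` section under the new option would be (a sketch):

```yaml
out:
  type: hdfs
  <<: *local_fs_example
  path_prefix: /tmp/embulk-output-hdfs_example/file_
  file_ext: csv
  mode: delete_files_in_advance   # stands in for delete_in_advance: FILE_ONLY
```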
src/main/java/org/embulk/output/hdfs/HdfsFileOutput.java

Lines changed: 160 additions & 0 deletions (new file)

```java
package org.embulk.output.hdfs;

import org.apache.hadoop.fs.Path;
import org.embulk.config.TaskReport;
import org.embulk.output.hdfs.HdfsFileOutputPlugin.PluginTask;
import org.embulk.output.hdfs.client.HdfsClient;
import org.embulk.spi.Buffer;
import org.embulk.spi.Exec;
import org.embulk.spi.FileOutput;
import org.embulk.spi.TransactionalFileOutput;
import org.embulk.spi.util.RetryExecutor;
import org.slf4j.Logger;

import java.io.IOException;
import java.io.OutputStream;

public class HdfsFileOutput
        implements FileOutput, TransactionalFileOutput
{
    private static final Logger logger = Exec.getLogger(HdfsFileOutput.class);
    private final RetryExecutor re = RetryExecutor.retryExecutor()
            .withRetryLimit(3)
            .withInitialRetryWait(500)         // ms
            .withMaxRetryWait(10 * 60 * 1000); // ms

    private final HdfsClient hdfsClient;
    private final int taskIdx;
    private final String pathPrefix;
    private final String sequenceFormat;
    private final String fileExt;
    private final boolean overwrite;

    private int fileIdx = 0;
    private Path currentPath = null;
    private OutputStream o = null;

    public HdfsFileOutput(PluginTask task, String pathPrefix, boolean overwrite, int taskIdx)
    {
        this.hdfsClient = HdfsClient.build(task);
        this.pathPrefix = pathPrefix;
        this.taskIdx = taskIdx;
        this.sequenceFormat = task.getSequenceFormat();
        this.fileExt = task.getFileExt();
        this.overwrite = overwrite;
    }

    @Override
    public void abort()
    {
    }

    @Override
    public TaskReport commit()
    {
        return Exec.newTaskReport();
    }

    @Override
    public void nextFile()
    {
        closeCurrentStream();
        currentPath = newPath();
        fileIdx++;
    }

    @Override
    public void add(Buffer buffer)
    {
        try {
            // Create the file lazily, so it only exists when there is data to write.
            if (o == null) {
                o = hdfsClient.create(currentPath, overwrite);
                logger.info("Uploading '{}'", currentPath);
            }
            write(buffer);
        }
        catch (RetryExecutor.RetryGiveupException e) {
            throw new RuntimeException(e);
        }
        finally {
            buffer.release();
        }
    }

    @Override
    public void finish()
    {
        closeCurrentStream();
    }

    @Override
    public void close()
    {
        closeCurrentStream();
        hdfsClient.close();
    }

    private void write(final Buffer buffer)
            throws RetryExecutor.RetryGiveupException
    {
        re.run(new RetryExecutor.Retryable<Void>()
        {
            @Override
            public Void call()
                    throws Exception
            {
                o.write(buffer.array(), buffer.offset(), buffer.limit());
                return null;
            }

            @Override
            public boolean isRetryableException(Exception exception)
            {
                return true; // TODO: which exceptions are retryable?
            }

            @Override
            public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
                    throws RetryExecutor.RetryGiveupException
            {
                String m = String.format(
                        "%s. (Retry: Count: %d, Limit: %d, Wait: %d ms)",
                        exception.getMessage(),
                        retryCount,
                        retryLimit,
                        retryWait);
                logger.warn(m, exception);
            }

            @Override
            public void onGiveup(Exception firstException, Exception lastException)
                    throws RetryExecutor.RetryGiveupException
            {
            }
        });
    }

    private Path newPath()
    {
        return new Path(pathPrefix + getSequence() + fileExt);
    }

    private String getSequence()
    {
        return String.format(sequenceFormat, taskIdx, fileIdx);
    }

    private void closeCurrentStream()
    {
        if (o != null) {
            try {
                o.close();
                o = null;
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
}
```
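For orientation, here is a minimal sketch of the call sequence Embulk's file-output framework performs on this class (the path prefix, flag values, and the wrapper class are illustrative, not part of the commit):

```java
package org.embulk.output.hdfs;

import org.embulk.config.TaskReport;
import org.embulk.output.hdfs.HdfsFileOutputPlugin.PluginTask;
import org.embulk.spi.Buffer;

// Sketch only: in practice Embulk drives a TransactionalFileOutput itself.
public class HdfsFileOutputUsageSketch
{
    public static TaskReport writeOneFile(PluginTask task, Buffer buffer)
    {
        // taskIdx 0 and overwrite=false are illustrative values.
        HdfsFileOutput output = new HdfsFileOutput(task, "/tmp/embulk/file_", false, 0);
        try {
            output.nextFile();  // choose path_prefix + sequence + file_ext
            output.add(buffer); // opens the stream lazily, writes with retries, releases the buffer
            output.finish();    // closes the current stream
            return output.commit();
        }
        finally {
            output.close();     // also closes the underlying HDFS client
        }
    }
}
```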
