polygon-io
diff --git a/‎README.md‎
Lines changed: 21 additions & 0 deletions b/‎README.md‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎decode.go‎
Lines changed: 124 additions & 0 deletions b/‎decode.go‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎decode_test.go‎
Lines changed: 58 additions & 0 deletions b/‎decode_test.go‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎go.mod‎
Lines changed: 5 additions & 0 deletions b/‎go.mod‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎go.sum‎
Lines changed: 11 additions & 0 deletions b/‎go.sum‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎test_data/polygon.jpg‎
15.2 KB b/‎test_data/polygon.jpg‎
15.2 KB
@@ -0,0 +1,21 @@
+# Package UUEncode
+
+A short and sweet Go library that supports decoding uuencoded things.
+
+For more information on what uuencoding is/how it works, check out [this wikipedia article](https://en.wikipedia.org/wiki/Uuencoding).
+
+**Important Note:** This package currently only supports _decoding_ uuencoded contents (because...well...that's all we need here at Polygon.io for now :shrug:).
+Contributions are welcome, if you'd like to implement an `Encoder` struct and create a PR we'd be overjoyed :D
+
+uuencoding is an old, rarely used format at this point and the standard isn't very strict.
+There are lots of little variations in different implementations. 
+
+This particular implementation is geared towards decoding binary files within SEC filings. 
+It implements the behavior described in the wikipedia article linked, so it should be relatively portable.
+This implementation also adds some extra features to clean up input that doesn't quite conform to the expectations of that format.
+
+There are tests ensuring this package works decoding standard input, input encoded via the `uuencode` utility on macOS, and input encoded in the style that the SEC follows. 
+
+## Examples
+
+For examples, check out the test files ([decode](./decode_test.go))
@@ -0,0 +1,124 @@
+package uuencode
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/base64"
+	"fmt"
+	"io"
+	"strings"
+)
+
+const (
+	// StandardCharset is the standard charset for uuencoded files: the 64 ASCII characters 32 (space) through 95 (underscore).
+	StandardCharset = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"
+
+	// AlternateCharset is the same as the standard charset, except that the space character is replaced by backtick.
+	// This encoding is non-standard but used occasionally (for example, in the BSD uuencode implementation).
+	AlternateCharset = "`!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"
+)
+
+// Decoder encapsulates functionality for decoding uuencoded content.
+// To create a Decoder, use the helper functions NewStandardDecoder or NewDecoder(charset).
+// Both fields are populated by NewDecoder; a zero-value Decoder has neither an encoding nor a padding character set.
+type Decoder struct {
+	// encoding is used to decode individual lines within the encoded text.
+	// It is an unpadded base64 encoding built from the decoder's charset.
+	encoding *base64.Encoding
+
+	// paddingChar is used to pad lines that have had their padding chopped off for one reason or another.
+	// It is derived from the first character of the charset.
+	paddingChar string
+}
+
+// NewStandardDecoder builds a Decoder configured with StandardCharset.
+// It is shorthand for NewDecoder(StandardCharset).
+func NewStandardDecoder() Decoder {
+	standard := NewDecoder(StandardCharset)
+	return standard
+}
+
+// NewDecoder returns a decoder using the given charset.
+// See StandardCharset and AlternateCharset for common values.
+// Note: the provided charset must be a valid base64 charset, otherwise attempts to Decode may panic.
+func NewDecoder(charset string) Decoder {
+	return Decoder{
+		encoding:    base64.NewEncoding(charset).WithPadding(base64.NoPadding),
+		paddingChar: string(charset[0]), // Padding char is just the first character in the charset
+	}
+}
+
+// DecodeToBytes is a convenience function for decoding a reader when you just want all the decoded contents in memory in a byte slice.
+// See Decode for more info.
+func (d Decoder) DecodeToBytes(reader io.Reader) ([]byte, error) {
+	var buf bytes.Buffer
+	if err := d.Decode(reader, &buf); err != nil {
+		return nil, err
+	}
+
+	return buf.Bytes(), nil
+}
+
+// Decode decodes the uuencoded contents (as described here: https://en.wikipedia.org/wiki/Uuencoding#Encoded_format)
+// of reader and writes the decoded bytes to the given output writer.
+// This function assumes there is only one encoded file in the reader, it will ignore anything past the end of the first encoded file.
+//
+// Lines starting with "begin" and empty lines are skipped, and decoding stops successfully at the first line that is exactly "end".
+// An error is returned when reading from reader fails, when a content line has an invalid length character or cannot be decoded,
+// when writing to output fails, or when the input ends before an "end" line is seen.
+func (d Decoder) Decode(reader io.Reader, output io.Writer) error {
+	scanner := bufio.NewScanner(reader)
+
+	lineNumber := 0
+	for scanner.Scan() {
+		lineNumber++
+
+		// The scanner can still hand back buffered data after a read has failed.
+		// Bail out here rather than decoding a line that may have been cut short.
+		if scanner.Err() != nil {
+			return fmt.Errorf("error while scanner reader: %w", scanner.Err())
+		}
+
+		line := scanner.Text()
+
+		// We don't care about the begin line, we also don't care about empty lines
+		if strings.HasPrefix(line, "begin") || line == "" {
+			continue
+		}
+
+		// When we find the first end line, we're done.
+		if line == "end" {
+			return nil
+		}
+
+		// If it's not a begin or end line, first check the line length character.
+		// If it's the special character backtick (`), the line is empty and we should skip it
+		lengthChar := line[0]
+		if lengthChar == '`' {
+			continue
+		}
+
+		// uuencoding adds 32 to the length so it's a printable character; anything below 32 can't be a length.
+		// Reject it explicitly: the byte subtraction below would otherwise wrap around to a huge bogus length.
+		if lengthChar < 32 {
+			return fmt.Errorf("error decoding line %d: invalid length character %q", lineNumber, lengthChar)
+		}
+
+		decodedLen := lengthChar - 32
+
+		// Some encoding schemes don't use the special character for empty lines.
+		if decodedLen == 0 {
+			continue
+		}
+
+		// The formatted characters are everything after the length char.
+		// Sometimes padding is omitted from the line, so we have to make sure we add it back before decoding.
+		expectedLen := d.encoding.EncodedLen(int(decodedLen))
+		encodedCharacters := d.padContentLine(line[1:], expectedLen)
+
+		decoded, err := d.encoding.DecodeString(encodedCharacters)
+		if err != nil {
+			return fmt.Errorf("error decoding line %d: %w", lineNumber, err)
+		}
+
+		// Write the decoded bytes to the output writer.
+		// Only the first decodedLen bytes are real data, the rest comes from line padding.
+		if _, err := output.Write(decoded[:decodedLen]); err != nil {
+			return fmt.Errorf("error writing decoded bytes to writer: %w", err)
+		}
+	}
+
+	// Scan returns false both at the end of the input and on failure (read error, over-long line),
+	// so the scanner's error must be checked here. Otherwise those failures would be
+	// misreported as a missing 'end' line.
+	if err := scanner.Err(); err != nil {
+		return fmt.Errorf("error while scanner reader: %w", err)
+	}
+
+	// If we made it out of the loop without an error, it means we never saw the 'end' line
+	return fmt.Errorf("malformed input; missing 'end' line")
+}
+
+func (d Decoder) padContentLine(line string, expectedLen int) string {
+	for len(line) < expectedLen {
+		line += d.paddingChar
+	}
+
+	return line
+}
@@ -0,0 +1,58 @@
+package uuencode
+
+import (
+	"io"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestDecodeToBytes checks that a minimal in-memory uuencoded payload decodes to the expected bytes.
+func TestDecodeToBytes(t *testing.T) {
+	t.Run("standard encoding", func(t *testing.T) {
+		// One content line ("#0V%T", where '#' is the length character for 3 bytes),
+		// followed by a backtick line marking empty content and the closing "end" line.
+		encoded := strings.Join([]string{
+			"begin 644 cat.txt",
+			"#0V%T",
+			"`",
+			"end",
+		}, "\n")
+
+		decoded, err := NewStandardDecoder().DecodeToBytes(strings.NewReader(encoded))
+		require.NoError(t, err)
+		assert.EqualValues(t, "Cat", string(decoded))
+	})
+}
+
+func TestDecodeFiles(t *testing.T) {
+	t.Run("BSD/Alternate style encoding", func(t *testing.T) {
+		// polygon.uu was uuencoded using the `uuencode` utility on macOS
+		decodeFile(t, NewDecoder(AlternateCharset), "test_data/polygon.uu", "test_data/polygon.jpg")
+	})
+
+	t.Run("standard/SEC style encoding", func(t *testing.T) {
+		// polygon.sec.uu was encoded in the same style that the SEC uses when disseminating binary files in filings
+		decodeFile(t, NewStandardDecoder(), "test_data/polygon.sec.uu", "test_data/polygon.jpg")
+	})
+}
+
+// decodeFile decodes the uuencoded file at encodedFilename with the given decoder and asserts that
+// the result is byte-for-byte equal to the contents of the file at decodedFilename.
+// Any failure to open, read, or decode a file aborts the calling test.
+func decodeFile(t *testing.T, decoder Decoder, encodedFilename, decodedFilename string) {
+	// Mark this function as a helper so failures are reported at the calling subtest's line.
+	t.Helper()
+
+	encodedFile, err := os.Open(encodedFilename)
+	require.NoError(t, err)
+
+	defer encodedFile.Close()
+
+	decodedBytes, err := decoder.DecodeToBytes(encodedFile)
+	require.NoError(t, err)
+
+	expectedFile, err := os.Open(decodedFilename)
+	require.NoError(t, err)
+
+	defer expectedFile.Close()
+
+	expectedBytes, err := io.ReadAll(expectedFile)
+	require.NoError(t, err)
+
+	assert.Equal(t, expectedBytes, decodedBytes)
+}
@@ -0,0 +1,5 @@
+module github.com/polygon-io/uuencode
+
+go 1.16
+
+require github.com/stretchr/testify v1.7.0
@@ -0,0 +1,11 @@
+github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +module github.com/polygon-io/uuencode
++
 +go 1.16
++
 +require github.com/stretchr/testify v1.7.0