Skip to content

Commit cebdd8a

Browse files
implement uudecoding and add README (#1)
* implement uudecoding and add README * fix typos * support line padding this is how the SEC does their encoding... * test for SEC style input: * remove lone comment
1 parent bf1079f commit cebdd8a

File tree

8 files changed

+917
-0
lines changed

8 files changed

+917
-0
lines changed

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Package UUEncode
2+
3+
A short and sweet Go library that supports decoding uuencoded things.
4+
5+
For more information on what uuencoding is/how it works, check out [this wikipedia article](https://en.wikipedia.org/wiki/Uuencoding).
6+
7+
**Important Note:** This package currently only supports _decoding_ uuencoded contents (because...well...that's all we need here at Polygon.io for now :shrug:).
8+
Contributions are welcome, if you'd like to implement an `Encoder` struct and create a PR we'd be overjoyed :D
9+
10+
uuencoding is an old, rarely used format at this point, and the standard isn't very strict.
11+
There are lots of little variations in different implementations.
12+
13+
This particular implementation is geared towards decoding binary files within SEC filings.
14+
It implements the behavior described in the Wikipedia article linked above, so it should be relatively portable.
15+
This implementation also adds some extra features to clean up input that doesn't quite conform to the expectations of that format.
16+
17+
There are tests ensuring this package works decoding standard input, input encoded via the `uuencode` utility on macOS, and input encoded in the style that the SEC follows.
18+
19+
## Examples
20+
21+
For examples, check out the test files ([decode](./decode_test.go))

decode.go

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
package uuencode
2+
3+
import (
4+
"bufio"
5+
"bytes"
6+
"encoding/base64"
7+
"fmt"
8+
"io"
9+
"strings"
10+
)
11+
12+
const (
	// StandardCharset is the standard charset for uuencoded files: ASCII characters 32 - 95.
	StandardCharset = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"

	// AlternateCharset is the same as the standard charset, except that the space character is replaced by backtick.
	// This encoding is non-standard but used occasionally. (Like in the BSD uuencode implementation).
	AlternateCharset = "`!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"
)

// Decoder encapsulates functionality for decoding uuencoded content.
// To create a Decoder, use the helper functions NewStandardDecoder or NewDecoder(charset).
type Decoder struct {
	// encoding is used to decode individual lines within the encoded text.
	encoding *base64.Encoding

	// paddingChar is used to pad lines that have had their padding chopped off for one reason or another.
	paddingChar string
}

// NewStandardDecoder returns a Decoder that uses the StandardCharset.
func NewStandardDecoder() Decoder {
	return NewDecoder(StandardCharset)
}

// NewDecoder returns a decoder using the given charset.
// See StandardCharset and AlternateCharset for common values.
//
// The charset is handed to base64.NewEncoding, so it must be a valid
// 64-character base64 alphabet; NewDecoder panics otherwise.
func NewDecoder(charset string) Decoder {
	// Build the encoding first so that an invalid charset is reported by
	// base64.NewEncoding rather than by the slice expression below.
	encoding := base64.NewEncoding(charset).WithPadding(base64.NoPadding)

	return Decoder{
		encoding: encoding,
		// The padding char is the first byte of the charset, i.e. the symbol that encodes the value zero.
		paddingChar: charset[:1],
	}
}

// DecodeToBytes is a convenience function for decoding a reader when you just want all the decoded contents in memory in a byte slice.
// It returns a nil slice together with the error if decoding fails.
// See Decode for more info.
func (d Decoder) DecodeToBytes(reader io.Reader) ([]byte, error) {
	var buf bytes.Buffer
	if err := d.Decode(reader, &buf); err != nil {
		return nil, err
	}

	return buf.Bytes(), nil
}

// Decode decodes the uuencoded contents (as described here: https://en.wikipedia.org/wiki/Uuencoding#Encoded_format)
// of reader and writes the decoded bytes to the given output writer.
// This function assumes there is only one encoded file in the reader, it will ignore anything past the end of the first encoded file.
//
// Lines starting with "begin" and empty lines are skipped. Decoding stops
// successfully at the first line that is exactly "end". An error is returned when:
//   - a line starts with a character that cannot be a uuencode length character,
//   - a line's content cannot be decoded or yields fewer bytes than its length character promises,
//   - writing to output fails,
//   - reading from reader fails, or
//   - the input finishes without an "end" line.
//
// Bytes decoded from earlier lines may already have been written to output when an error is returned.
func (d Decoder) Decode(reader io.Reader, output io.Writer) error {
	// uuencoding adds 32 to the byte count of a line so that it is a printable character.
	const lengthOffset = 32

	// Backtick (ASCII 96) is the special length character marking an empty line;
	// it is also the largest character that can legally start a content line.
	const emptyLineChar = '`'

	scanner := bufio.NewScanner(reader)

	lineNumber := 0
	for scanner.Scan() {
		lineNumber++

		line := scanner.Text()

		// We don't care about the begin line, we also don't care about empty lines.
		if line == "" || strings.HasPrefix(line, "begin") {
			continue
		}

		// When we find the first end line, we're done.
		if line == "end" {
			return nil
		}

		// If it's not a begin or end line, the first character carries the line length.
		// Reject characters outside the printable length range up front: with byte
		// arithmetic they would wrap around to a bogus length and produce garbage output.
		lengthChar := line[0]
		if lengthChar < lengthOffset || lengthChar > emptyLineChar {
			return fmt.Errorf("error decoding line %d: invalid length character %q", lineNumber, lengthChar)
		}

		// Empty lines are marked either with the special backtick character or,
		// in some encoding schemes, with the regular encoding of zero.
		decodedLen := int(lengthChar) - lengthOffset
		if lengthChar == emptyLineChar || decodedLen == 0 {
			continue
		}

		// The formatted characters are everything after the length char.
		// Sometimes padding is omitted from the line, so we have to make sure we add it back before decoding.
		expectedLen := d.encoding.EncodedLen(decodedLen)
		encodedCharacters := d.padContentLine(line[1:], expectedLen)

		decoded, err := d.encoding.DecodeString(encodedCharacters)
		if err != nil {
			return fmt.Errorf("error decoding line %d: %w", lineNumber, err)
		}

		// The base64 decoder silently skips carriage returns, so a line can decode to
		// fewer bytes than promised. Check explicitly instead of reslicing past len(decoded).
		if len(decoded) < decodedLen {
			return fmt.Errorf("error decoding line %d: decoded %d bytes, expected %d", lineNumber, len(decoded), decodedLen)
		}

		// Write the decoded bytes to the output writer, dropping any bytes that only came from padding.
		if _, err := output.Write(decoded[:decodedLen]); err != nil {
			return fmt.Errorf("error writing decoded bytes to writer: %w", err)
		}
	}

	// Scan returns false both at the end of the input and on failure, so the
	// scanner's error has to be inspected once the loop is over.
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("error while scanning reader: %w", err)
	}

	// If we made it out of the loop without an error, it means we never saw the 'end' line.
	return fmt.Errorf("malformed input; missing 'end' line")
}

// padContentLine returns line extended with the decoder's padding character
// until it is at least expectedLen bytes long. Lines that are already long
// enough are returned unchanged.
func (d Decoder) padContentLine(line string, expectedLen int) string {
	if missing := expectedLen - len(line); missing > 0 {
		return line + strings.Repeat(d.paddingChar, missing)
	}

	return line
}

decode_test.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package uuencode
2+
3+
import (
4+
"io"
5+
"os"
6+
"strings"
7+
"testing"
8+
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
)
12+
13+
func TestDecodeToBytes(t *testing.T) {
14+
t.Run("standard encoding", func(t *testing.T) {
15+
input := "begin 644 cat.txt\n" +
16+
"#0V%T\n" +
17+
"`\n" +
18+
"end"
19+
20+
decoder := NewStandardDecoder()
21+
22+
results, err := decoder.DecodeToBytes(strings.NewReader(input))
23+
require.NoError(t, err)
24+
assert.EqualValues(t, "Cat", string(results))
25+
})
26+
}
27+
28+
func TestDecodeFiles(t *testing.T) {
29+
t.Run("BSD/Alternate style encoding", func(t *testing.T) {
30+
// polygon.uu was uuencoded using the `uuencode` utility on macOS
31+
decodeFile(t, NewDecoder(AlternateCharset), "test_data/polygon.uu", "test_data/polygon.jpg")
32+
})
33+
34+
t.Run("standard/SEC style encoding", func(t *testing.T) {
35+
// polygon.sec.uu was encoded in the same style that the SEC uses when disseminating binary files in filings
36+
decodeFile(t, NewStandardDecoder(), "test_data/polygon.sec.uu", "test_data/polygon.jpg")
37+
})
38+
}
39+
40+
func decodeFile(t *testing.T, decoder Decoder, encodedFilename, decodedFilename string) {
41+
encodedFile, err := os.Open(encodedFilename)
42+
require.NoError(t, err)
43+
44+
defer encodedFile.Close()
45+
46+
decodedBytes, err := decoder.DecodeToBytes(encodedFile)
47+
require.NoError(t, err)
48+
49+
expectedFile, err := os.Open(decodedFilename)
50+
require.NoError(t, err)
51+
52+
defer expectedFile.Close()
53+
54+
expectedBytes, err := io.ReadAll(expectedFile)
55+
require.NoError(t, err)
56+
57+
assert.Equal(t, expectedBytes, decodedBytes)
58+
}

go.mod

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
module github.com/polygon-io/uuencode
2+
3+
go 1.16
4+
5+
require github.com/stretchr/testify v1.7.0

go.sum

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
2+
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
3+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
4+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
5+
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
6+
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
7+
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
8+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
9+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
10+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
11+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

test_data/polygon.jpg

15.2 KB
Loading

0 commit comments

Comments
 (0)