diff --git a/placeholder.go b/placeholder.go index 4843cd0..3392f77 100644 --- a/placeholder.go +++ b/placeholder.go @@ -3,7 +3,6 @@ package docx import ( "fmt" "log" - "regexp" "strings" ) @@ -20,13 +19,6 @@ func ChangeOpenCloseDelimiter(openDelimiter, closeDelimiter rune) { CloseDelimiter = closeDelimiter } -var ( - // OpenDelimiterRegex is used to quickly match the opening delimiter and find it'str positions. - OpenDelimiterRegex = regexp.MustCompile(string(OpenDelimiter)) - // CloseDelimiterRegex is used to quickly match the closing delimiter and find it'str positions. - CloseDelimiterRegex = regexp.MustCompile(string(CloseDelimiter)) -) - // PlaceholderMap is the type used to map the placeholder keys (without delimiters) to the replacement values type PlaceholderMap map[string]interface{} @@ -73,208 +65,85 @@ func (p Placeholder) Valid() bool { // ParsePlaceholders will, given the document run positions and the bytes, parse out all placeholders including // their fragments. func ParsePlaceholders(runs DocumentRuns, docBytes []byte) (placeholders []*Placeholder, err error) { - // tmp vars used to preserve state across iterations - unclosedPlaceholder := new(Placeholder) - hasOpenPlaceholder := false - + // Use stack to trace the delimiter pair + stack := []*PlaceholderFragment{} for _, run := range runs.WithText() { - runText := run.GetText(docBytes) - - openDelimPositions := OpenDelimiterRegex.FindAllStringIndex(runText, -1) - closeDelimPositions := CloseDelimiterRegex.FindAllStringIndex(runText, -1) - - // FindAllStringIndex returns a [][]int whereas the nested []int has only 2 keys (0 and 1) - // We're only interested in the first key as that one indicates the position of the delimiter - delimPositions := func(positions [][]int) []int { - var pos []int - for _, position := range positions { - pos = append(pos, position[0]) - } - return pos - } - - // index all delimiters - openPos := delimPositions(openDelimPositions) - closePos := delimPositions(closeDelimPositions) - - // In case there are the same amount of open and close delimiters. - // Here we will have three three different sub-cases. - // Case 1 (default): - // '{foo}{bar}' which is the simplest case to handle - // - // Case 2 (special): - // '}foo{bar}foo{' which can easily be detected by checking if 'openPos > endPos'. - // That case can only be valid if there is an unclosed placeholder in a previous run. - // If there is no unclosed placeholder, then there is some form of user error (e.g. '{baz}}foo{bar}'). - // We can also be sure that the first close and the last open delimiters are wrong, all the other ones - // in between will be correct, given the len(openPos)==len(closePos) premise. - // We're ignoring the case in which the user might've entered '}foo}bar{foo{' and went full derp-mode. - // - // Case 3 (nested): - // '{foo{bar}foo}' aka placeholder-nesting, which is acatually not going to be supported - // but needs to be detected and handled anyway. TODO handle nestings - if (len(openPos) == len(closePos)) && len(openPos) != 0 { - - // isSpecialCase checks if, for all found delimiters, startPos > endPos is true (case 2) - isSpecialCase := func() bool { - for i := 0; i < len(openPos); i++ { - start := openPos[i] - end := closePos[i] + 1 // +1 is required to include the closing delimiter in the text - if start > end { - return true - } - } - return false + hasDelimiter := false + runRune := []rune(run.GetText(docBytes)) + for i := 0; i < len(runRune); i++ { + // There is an open delimiter in the run, thus create a partial placeholder fragment + if runRune[i] == OpenDelimiter { + hasDelimiter = true + stack = append(stack, NewPlaceholderFragment(Position{int64(i), -1}, run)) + continue } - // isNestedCase checks if, there are >1 OpenDelimiters before the first CloseDelimiter - // if there is only 1 openPos, this cannot be true (we already know that it's not 0 - isNestedCase := func() bool { - if len(openPos) == 1 { - return false + if runRune[i] == CloseDelimiter { + // There is a close delimiter in the run, 3 scenarios may happen: + // 1) The stack is empty, no open delimiter can match this close delimiter, + // this must be a corrupted placeholder, we log the error and skip + if len(stack) == 0 { + log.Printf( + "detected unmatched close delimiter in run %d \"%s\", index %d, skipping \n", + run.ID, run.GetText(docBytes), i, + ) + continue } - if openPos[0] < closePos[0] && - openPos[1] < closePos[0] { - return true - } - return false - } - // handle case 2 - if isSpecialCase() { - - // handle the easy part (everything between the the culprit first '}' and last '{' in the example of '}foo{bar}foo{' - validOpenPos := openPos[:len(openPos)-1] - validClosePos := closePos[1:] - placeholders = append(placeholders, assembleFullPlaceholders(run, validOpenPos, validClosePos)...) - - // extract the first open and last close delimiter positions as they are the one causing issues. - lastOpenPos := openPos[len(openPos)-1] - firstClosePos := closePos[0] - - // we MUST be having an unclosedPlaceholder or the user made a typo like double-closing ('{foo}}{bar') - if !hasOpenPlaceholder { - return nil, fmt.Errorf("unexpected %c in run %d \"%s\"), missing preceeding %c", CloseDelimiter, run.ID, run.GetText(docBytes), OpenDelimiter) + // 2) The stack is not empty, + hasDelimiter = true + fragment := stack[len(stack)-1] + stack = stack[:len(stack)-1] + if run == fragment.Run { + // a) The close delimiter is in the same run as the open delimiter, then we take + // the partial fragment from the top of the stack, and complete its end position, to make a + // complete placeholder with only 1 fragment. + // e.g., run like: + // foo{bar}baz + // foo{bar}baz{qux}bbb + fragment.Position.End = int64(i) + 1 + placeholders = append(placeholders, &Placeholder{Fragments: []*PlaceholderFragment{fragment}}) + } else { + // b) There are some span runs between the run of open and close delimiter, then we first + // take the partial fragment from the top of the stack, and its end position must be the end of + // that run. Then we create span fragments, with its length set to the run length. Finally, we + // create the fragment that includes the close delimiter, with its start position set to 0, and + // end position set to the position of the close delimiter. + // e.g., run like (here | is the run boundary): + // foo{bar|}baz => {bar} + // foo{bar|abc|}baz => {barabc} + // foo{bar|abc|def|}baz => {barabcdef} + // foo{bar|{bc|d}ef|}baz => {bar{bcd}ef} {bcd} + fragment.Position.End = int64(len(fragment.Run.GetText(docBytes))) + fragments := []*PlaceholderFragment{fragment} + for _, srun := range fragment.SpanRun { + fragments = append( + fragments, + NewPlaceholderFragment(Position{0, int64(len(srun.GetText(docBytes)))}, srun), + ) + } + fragments = append(fragments, NewPlaceholderFragment(Position{0, int64(i) + 1}, run)) + placeholders = append(placeholders, &Placeholder{Fragments: fragments}) } - - // everything up to firstClosePos belongs to the currently open placeholder - fragment := NewPlaceholderFragment(0, Position{0, int64(firstClosePos) + 1}, run) - unclosedPlaceholder.Fragments = append(unclosedPlaceholder.Fragments, fragment) - placeholders = append(placeholders, unclosedPlaceholder) - - // a new, unclosed, placeholder starts at lastOpenPos - fragment = NewPlaceholderFragment(0, Position{int64(lastOpenPos), int64(len(runText))}, run) - unclosedPlaceholder = new(Placeholder) - unclosedPlaceholder.Fragments = append(unclosedPlaceholder.Fragments, fragment) - hasOpenPlaceholder = true - - continue - } - - // there are multiple ways to handle this - // - error - // - cut out - // - skip the run (that's what we do because we're lazy bums) - if isNestedCase() { - log.Printf("detected nested placeholder in run %d \"%s\", skipping \n", run.ID, run.GetText(docBytes)) continue } - - // case 1, assemble and continue - placeholders = append(placeholders, assembleFullPlaceholders(run, openPos, closePos)...) - continue } - - // More open than closing delimiters, e.g. '{foo}{bar' - // this can only mean that a placeholder is left unclosed after this run - // For the length this means that (len(openPos) + 1) == len(closePos) - // So we can be sure that the last position in openPos is the opening tag of the - // unclosed placeholder. - if len(openPos) > len(closePos) { - // merge full placeholders in the run, leaving out the last openPos since - // we know that the one is left over and must be handled separately below - placeholders = append(placeholders, assembleFullPlaceholders(run, openPos[:len(openPos)-1], closePos)...) - - // add the unclosed part of the placeholder to a tmp placeholder var - unclosedOpenPos := openPos[len(openPos)-1] - fragment := NewPlaceholderFragment(0, Position{int64(unclosedOpenPos), int64(len(runText))}, run) - unclosedPlaceholder.Fragments = append(unclosedPlaceholder.Fragments, fragment) - hasOpenPlaceholder = true - continue - } - - // More closing than opening delimiters, e.g. '}{foo}' - // this can only mean that there must be an unclosed placeholder which - // is closed in this run. - if len(openPos) < len(closePos) { - // merge full placeholders in the run, leaving out the last closePos since - // we know that the one is left over and must be handled separately below - placeholders = append(placeholders, assembleFullPlaceholders(run, openPos, closePos[:len(closePos)-1])...) - - // there is only a closePos and no open pos - if len(closePos) == 1 { - fragment := NewPlaceholderFragment(0, Position{0, int64(int64(closePos[0]) + 1)}, run) - unclosedPlaceholder.Fragments = append(unclosedPlaceholder.Fragments, fragment) - placeholders = append(placeholders, unclosedPlaceholder) - unclosedPlaceholder = new(Placeholder) - hasOpenPlaceholder = false - continue - } - continue - } - - // No placeholders at all. - // The run is only relevant if there is an unclosed placeholder from a previous run. - // In that case it means that the full run-text belongs to the placeholder. - // For example, if a placeholder has three fragments in total, this represents fragment 2 (see below) - // 1) '{foo' - // 2) 'bar-' - // 3) '-baz} - if len(openPos) == 0 && len(closePos) == 0 { - if hasOpenPlaceholder { - fragment := NewPlaceholderFragment(0, Position{0, int64(len(runText))}, run) - unclosedPlaceholder.Fragments = append(unclosedPlaceholder.Fragments, fragment) + if !hasDelimiter { + // If a run has no delimiter, it must be a span run. Thus we add the run to all the partial framents that + // has not been closed. + for i := 0; i < len(stack); i++ { + stack[i].SpanRun = append(stack[i].SpanRun, run) continue } } } - // Make sure that we're dealing with valid and proper placeholders only. - // Everything else may cause issues like out of bounds errors or any other sort of weird things. - // Here we will also assemble the final list of placeholders and return only the valid ones. - var validPlaceholders []*Placeholder - for _, placeholder := range placeholders { - if !placeholder.Valid() { - continue - } - - // in order to catch false positives, ensure that all placeholders have BOTH delimiters - text := placeholder.Text(docBytes) - if !strings.ContainsRune(text, OpenDelimiter) || - !strings.ContainsRune(text, CloseDelimiter) { - continue - } - - // placeholder is valid - validPlaceholders = append(validPlaceholders, placeholder) + // Warn user there are some unmatched open delimiters (a.k.a corrupted placeholders) left in the stack + for _, fragment := range stack { + log.Printf("detected unmatched open delimiter in run %d \"%s\", index %d, skipping \n", fragment.Run.ID, fragment.Run.GetText(docBytes), fragment.Position.Start) } - return validPlaceholders, nil -} -// assembleFullPlaceholders will extract all complete placeholders inside the run given a open and close position. -// The open and close positions are the positions of the Delimiters which must already be known at this point. -// openPos and closePos are expected to be symmetrical (e.g. same length). -// Example: openPos := []int{10,20,30}; closePos := []int{13, 23, 33} resulting in 3 fragments (10,13),(20,23),(30,33) -// The n-th elements inside openPos and closePos must be matching delimiter positions. -func assembleFullPlaceholders(run *Run, openPos, closePos []int) (placeholders []*Placeholder) { - for i := 0; i < len(openPos); i++ { - start := openPos[i] - end := closePos[i] + 1 // +1 is required to include the closing delimiter in the text - fragment := NewPlaceholderFragment(0, Position{int64(start), int64(end)}, run) - p := &Placeholder{Fragments: []*PlaceholderFragment{fragment}} - placeholders = append(placeholders, p) - } - return placeholders + return placeholders, nil } // AddPlaceholderDelimiter will wrap the given string with OpenDelimiter and CloseDelimiter. diff --git a/placeholder_fragment.go b/placeholder_fragment.go index 7f3d34a..72c9520 100644 --- a/placeholder_fragment.go +++ b/placeholder_fragment.go @@ -13,17 +13,17 @@ var ( type PlaceholderFragment struct { ID int // ID is used to identify the fragments globally. Position Position // Position of the actual fragment within the run text. 0 == (Run.Text.OpenTag.End + 1) - Number int // numbering fragments for ease of use. Numbering is scoped to placeholders. - Run *Run + //Number int // numbering fragments for ease of use. Numbering is scoped to placeholders. + Run *Run + SpanRun []*Run } // NewPlaceholderFragment returns an initialized PlaceholderFragment with a new, auto-incremented, ID. -func NewPlaceholderFragment(number int, pos Position, run *Run) *PlaceholderFragment { +func NewPlaceholderFragment(pos Position, run *Run) *PlaceholderFragment { return &PlaceholderFragment{ ID: NewFragmentID(), - Position: pos, - Number: number, Run: run, + Position: pos, } } diff --git a/placeholder_test.go b/placeholder_test.go index b63d086..511a4c0 100644 --- a/placeholder_test.go +++ b/placeholder_test.go @@ -46,14 +46,3 @@ func TestParsePlaceholders(t *testing.T) { } } } - -func TestPlaceholder_AssembleFullPlaceholders(t *testing.T) { - expectedCount := 2 - openPos := []int{10, 18} - closePos := []int{17, 25} - - placeholders := assembleFullPlaceholders(&Run{}, openPos, closePos) - if len(placeholders) != expectedCount { - t.Errorf("not all full placeholders were parsed, want=%d, have=%d", expectedCount, len(placeholders)) - } -} diff --git a/replace_test.go b/replace_test.go index 4734e36..f9c8c6d 100644 --- a/replace_test.go +++ b/replace_test.go @@ -18,6 +18,7 @@ func TestReplacer_Replace(t *testing.T) { "mixed-key.separator_styles#": "mixed-key.separator_styles#", "yet-another_placeholder": "yet-another_placeholder", "foo": "foo", + "nested": "nested", } doc, err := Open("./test/template.docx") diff --git a/test/template.docx b/test/template.docx index 368d428..564638a 100644 Binary files a/test/template.docx and b/test/template.docx differ