Skip to content

JSP parser throws errors for certain scenarios #127

@CodesAway

Description

@CodesAway

Found at work while using BEX to parse our JSP files. Below is the fix I found. I'll also add some unit tests.

	/**
	 * Parses JSP text and determines the parsing state of its regions
	 * (HTML tags, string literals, JSP / HTML / Java comments, and Java blocks).
	 *
	 * <p>The text is scanned one character at a time while a stack of currently open states is maintained.
	 * Opening delimiters push a state (together with the index where it started) and closing delimiters
	 * pop it, handing the builder to the pop helper. Two flags track the surrounding context:</p>
	 * <ul>
	 * <li><code>isJava</code> - set when <code>&lt;%</code>, <code>&lt;%=</code>, or <code>&lt;%!</code>
	 * is seen and cleared on <code>%&gt;</code></li>
	 * <li><code>isTag</code> - set when <code>&lt;</code> is seen outside of Java and outside of a tag,
	 * and cleared on the matching <code>&gt;</code></li>
	 * </ul>
	 *
	 * <p>NOTE(review): the push / pop helpers are defined elsewhere; presumably they record the finished
	 * ranges in the builder and track parent states - confirm against their implementation.</p>
	 *
	 * @param text the JSP text to parse
	 * @return the range map built from the states found in <code>text</code>
	 */
	public static ImmutableIntRangeMap<ParsingState> parseJSPTextStates(final CharSequence text) {
		// TODO: used Java as a basis and need to enhance
		// For example, to handle JSP Expression
		// https://www.tutorialspoint.com/jsp/jsp_syntax.htm

		// TODO: need to make RangeMap class and correctly handle nested ranges
		// Currently, doesn't work as expected
		// "stuff <%= expression%> more stuff"
		// "More stuff" after the expression should be seen as part of the String literal,
		// but isn't since it gets the last range, which is the expression, which is over
		// Think can fix by ending the state when going into an inner state
		// Then, when leaving the inner state, start a new state based on the outer state

		// TODO: make RangeMap class to handle this
		// When adding a new record, check for overlap using the below logic
		// + An overlap occurs if and only if
		// a) The added range's start is part of an existing range
		// * Can check by finding existing range in map and seeing if the added range's start is in the middle
		// * BEXUtilities.getEntryInRanges
		// b) An existing range's start is contained in the new range
		// * Can do a subRange check on the existing NavigableMap and see if there are any entries
		// If there's an overlap, handle by breaking apart ranges in pieces

		// Parse text to get states
		// * Block comment
		// * Line comment
		// * In String literal
		// * Other stuff?

		// Reference: https://www.tutorialspoint.com/jsp/jsp_syntax.htm

		ImmutableIntRangeMap.Builder<ParsingState> builder = ImmutableIntRangeMap.builder();
		// Parallel stacks: the open states, the index where each state started, and parent start information
		ArrayDeque<ParsingState> stateStack = new ArrayDeque<>();
		ArrayDeque<Integer> startTextInfoStack = new ArrayDeque<>();
		ArrayDeque<Integer> parentStartStack = new ArrayDeque<>();

		// Inside <% ... %>, <%= ... %>, or <%! ... %>
		boolean isJava = false;
		// HTML tag
		boolean isTag = false;
		// TODO: should I refactor and use this? how would I use it?
		//		String expectedEnd = "";

		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);

			ParsingState currentState = unwrapParsingState(stateStack.peek());

			// NOTE: the order of the branches matters
			// (the literal / comment states are checked first so their contents aren't seen as delimiters)
			if (currentState == IN_STRING_LITERAL) {
				if (c == '\\') {
					// Escape next character
					// (nextChar presumably yields '\0' when there is no next character, so stop parsing)
					if (nextChar(text, i) == '\0') {
						break;
					}

					i++;
				} else if (c == '"') {
					popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
				} else if (isTag && hasText(text, i, "<%=")) {
					// JSP expression embedded in a tag's attribute value
					pushNextLevelParsingState(IN_EXPRESSION_BLOCK, i, builder, stateStack, startTextInfoStack,
							parentStartStack);
					i += 2;

					isJava = true;
				}

				// Other characters don't matter??
				// TODO: handle unicode and other escaping in String literal
			} else if (currentState == IN_SECONDARY_STRING_LITERAL) {
				if (c == '\\') {
					// Escape next character
					if (nextChar(text, i) == '\0') {
						break;
					}

					i++;
				} else if (c == '\'') {
					popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
				} else if (isTag && hasText(text, i, "<%=")) {
					// JSP expression embedded in a tag's single-quoted attribute value
					// Kept consistent with the double-quoted branch above:
					// * Only applies in a tag (in Java code, "<%=" in a literal is just text)
					// * Must set isJava; otherwise, the closing "%>" is never recognized
					// and its '>' is mistaken for the end of the tag
					pushNextLevelParsingState(IN_EXPRESSION_BLOCK, i, builder, stateStack, startTextInfoStack,
							parentStartStack);
					i += 2;

					isJava = true;
				}

				// Other characters don't matter??
				// TODO: handle unicode and other escaping in String literal

				// TODO: Java comments only valid in <% code block %>
			} else if (isJava && hasText(text, i, "%>")) {
				isJava = false;

				if (currentState != IN_EXPRESSION_BLOCK) {
					// End the current state on the prior character
					popParsingState(i - 1, builder, stateStack, startTextInfoStack, parentStartStack);
				}

				// Skip the '>' and end the Java block on it
				i++;
				popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
			} else if (isJava && currentState == IN_LINE_COMMENT) {
				if (c == '\n' || c == '\r') {
					// Line comment ends on the character before the line terminator
					popParsingState(i - 1, builder, stateStack, startTextInfoStack, parentStartStack);
					i = handleLineTerminator(i, c, text, builder, stateStack, startTextInfoStack, parentStartStack);
				}
				// Other characters don't matter?
			} else if (isJava && currentState == IN_MULTILINE_COMMENT) {
				// Java block comment
				if (hasText(text, i, "*/")) {
					i++;
					popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
				}
			} else if (currentState == IN_MULTILINE_COMMENT) {
				// JSP comment (same state as a Java block comment, but not in Java)
				if (hasText(text, i, "--%>")) {
					i += 3;
					popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
				}
			} else if (currentState == IN_SECONDARY_MULTILINE_COMMENT) {
				// HTML comment
				if (hasText(text, i, "-->")) {
					i += 2;
					popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
				}
			} else if (isJava && c == '/' && nextChar(text, i) == '/') {
				pushNextLevelParsingState(IN_LINE_COMMENT, i, builder, stateStack, startTextInfoStack,
						parentStartStack);
				i++;
			} else if (isJava && c == '/' && nextChar(text, i) == '*') {
				pushNextLevelParsingState(IN_MULTILINE_COMMENT, i, builder, stateStack, startTextInfoStack,
						parentStartStack);
				i++;
			} else if (c == '"' && isTag) {
				pushNextLevelParsingState(IN_STRING_LITERAL, i, builder, stateStack, startTextInfoStack,
						parentStartStack);
			} else if (c == '\'' && isTag) {
				pushNextLevelParsingState(IN_SECONDARY_STRING_LITERAL, i, builder, stateStack,
						startTextInfoStack, parentStartStack);
			} else if (c == '"' && isJava) {
				pushParsingState(IN_STRING_LITERAL, i, stateStack, startTextInfoStack, parentStartStack);
			} else if (c == '\'' && isJava) {
				pushParsingState(IN_SECONDARY_STRING_LITERAL, i, stateStack, startTextInfoStack, parentStartStack);
			} else if (hasText(text, i, "<%--")) {
				// JSP comment (must be checked before "<%")
				pushParsingState(IN_MULTILINE_COMMENT, i, stateStack, startTextInfoStack, parentStartStack);
				i += 3;
			} else if (hasText(text, i, "<!--")) {
				// HTML comment
				pushParsingState(IN_SECONDARY_MULTILINE_COMMENT, i, stateStack, startTextInfoStack, parentStartStack);
				i += 3;
			} else if (hasText(text, i, "<%=")) {
				// In Java expression
				pushParsingState(IN_EXPRESSION_BLOCK, i, stateStack, startTextInfoStack, parentStartStack);
				i += 2;
				isJava = true;
			} else if (hasText(text, i, "<%!")) {
				// "<%!" is tracked with the same state as an expression
				pushParsingState(IN_EXPRESSION_BLOCK, i, stateStack, startTextInfoStack, parentStartStack);
				i += 2;
				isJava = true;
			} else if (hasText(text, i, "<%")) {
				// In Java scriptlet
				pushParsingState(IN_EXPRESSION_BLOCK, i, stateStack, startTextInfoStack, parentStartStack);
				i++;
				isJava = true;
			} else if (c == '<' && !isJava && !isTag) {
				pushParsingState(IN_TAG, i, stateStack, startTextInfoStack, parentStartStack);
				isTag = true;
			} else if (c == '>' && isTag && !isJava) {
				isTag = false;
				popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
			} else if (Character.isWhitespace(c)) {
				i = handleWhitespace(i, c, text, builder, stateStack, startTextInfoStack, parentStartStack);
			}
		}

		// Text ended while a state was still open; only the innermost state is recorded
		if (!stateStack.isEmpty()) {
			// TODO: what if there are multiple entries?
			// (this would suggest improperly formatted code)
			int startTextInfo = startTextInfoStack.pop();
			// TODO: does there need to be a parent?
			if (startTextInfo != text.length()) {
				builder.put(IntBEXRange.of(startTextInfo, text.length()), stateStack.pop());
			}
		}

		return builder.build();
	}

Metadata

Metadata

Assignees

Labels

No labels
No labels

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions