From 805a0f09863391930c548e32a5e4671a00aa327e Mon Sep 17 00:00:00 2001 From: acezxn Date: Thu, 28 Aug 2025 16:23:12 -0400 Subject: [PATCH 01/23] Added javascript NPD support --- src/agent/dfbscan.py | 5 + src/llmtool/LLM_utils.py | 2 +- .../dfbscan/intra_dataflow_analyzer.json | 127 +++++++ .../Javascript/dfbscan/path_validator.json | 93 +++++ src/repoaudit.py | 8 + src/run_repoaudit.sh | 6 +- src/tstool/analyzer/Javascript_TS_analyzer.py | 336 ++++++++++++++++++ src/tstool/analyzer/TS_analyzer.py | 10 +- .../Javascript/Javascript_NPD_extractor.py | 41 +++ .../dfbscan_extractor/Javascript/__init__.py | 0 .../Python/Python_NPD_extractor.py | 2 - 11 files changed, 623 insertions(+), 7 deletions(-) create mode 100644 src/prompt/Javascript/dfbscan/intra_dataflow_analyzer.json create mode 100644 src/prompt/Javascript/dfbscan/path_validator.json create mode 100644 src/tstool/analyzer/Javascript_TS_analyzer.py create mode 100644 src/tstool/dfbscan_extractor/Javascript/Javascript_NPD_extractor.py create mode 100644 src/tstool/dfbscan_extractor/Javascript/__init__.py diff --git a/src/agent/dfbscan.py b/src/agent/dfbscan.py index d49073a..027fe0a 100644 --- a/src/agent/dfbscan.py +++ b/src/agent/dfbscan.py @@ -19,6 +19,7 @@ from tstool.dfbscan_extractor.Cpp.Cpp_UAF_extractor import * from tstool.dfbscan_extractor.Java.Java_NPD_extractor import * from tstool.dfbscan_extractor.Python.Python_NPD_extractor import * +from tstool.dfbscan_extractor.Javascript.Javascript_NPD_extractor import * from tstool.dfbscan_extractor.Go.Go_NPD_extractor import * from llmtool.LLM_utils import * @@ -109,9 +110,13 @@ def __obtain_extractor(self) -> DFBScanExtractor: elif self.language == "Python": if self.bug_type == "NPD": return Python_NPD_Extractor(self.ts_analyzer) + elif self.language == "Javascript": + if self.bug_type == "NPD": + return Javascript_NPD_Extractor(self.ts_analyzer) elif self.language == "Go": if self.bug_type == "NPD": return Go_NPD_Extractor(self.ts_analyzer) + raise 
NotImplementedError( f"Unsupported bug type: {self.bug_type} in {self.language}" ) diff --git a/src/llmtool/LLM_utils.py b/src/llmtool/LLM_utils.py index 843c2db..4976710 100644 --- a/src/llmtool/LLM_utils.py +++ b/src/llmtool/LLM_utils.py @@ -92,7 +92,7 @@ def run_with_timeout(self, func, timeout): def infer_with_gemini(self, message: str) -> str: """Infer using the Gemini model from Google Generative AI""" - gemini_model = genai.GenerativeModel("gemini-pro") + gemini_model = genai.GenerativeModel(self.online_model_name) def call_api(): message_with_role = self.systemRole + "\n" + message diff --git a/src/prompt/Javascript/dfbscan/intra_dataflow_analyzer.json b/src/prompt/Javascript/dfbscan/intra_dataflow_analyzer.json new file mode 100644 index 0000000..740a16c --- /dev/null +++ b/src/prompt/Javascript/dfbscan/intra_dataflow_analyzer.json @@ -0,0 +1,127 @@ +{ + "model_role_name": "Intra-procedural Data Flow Analyzer", + "user_role_name": "Intra-procedural Data Flow Analyzer", + "system_role": "You are a Javascript programmer and very good at analyzing Javascript code. Particularly, you excel at understanding individual Javascript functions and their data flow relationships.", + "task": "Given a specific source variable/parameter/expression (denoted as SRC) at a specific line (denoted as L1), analyze the execution flows of the given function and determine the variables to which SRC can propagate.", + "analysis_rules": [ + "The key principle for answering this question is to extract all execution paths related to SRC and simulate the function's execution along each path to determine where SRC propagates. In Javascript, SRC can propagate to four possible locations:", + "1. Function Calls: SRC propagates to a call site where it is passed as an argument to a callee function within the current function.", + "2. Return Statements: SRC propagates to a return statement, returning a value to the caller of the current function.", + "3. 
Function Parameters: SRC propagates to a parameter of the current function and can be referenced in the caller function, since objects are passed by reference.", + "4. Sink variables: SRC reaches one of the predefined sink variables provided in the input.", + "If SRC is referenced by function parameters, it can propagate beyond the function scope after the function exits, due to object references being shared between caller and callee. For example, if function goo passes an object base to its callee function foo, and foo(obj: Base) { obj = SRC; }, then the caller function goo can access the updated state of SRC through the object base.", + "To conduct the analysis, follow these three steps:", + "", + "- Step 1: Identify SRC and its aliases within the current function. Extract key points, including:", + " 1. Sink Statements: Where SRC is used or assigned to predefined sink variables.", + " 2. Function Invocations: Call sites where SRC is passed as an argument.", + " 3. Return Statements: Points where the function returns, possibly propagating SRC.", + " 4. Parameter Assignments: Assignments where SRC is assigned to a parameter or an object field that is accessible outside the function.", + "- Step 2: Identify all execution paths relevant to the key points found in Step 1. 
For each path:", + " - Identify every potential execution path;", + " - Verify whether the key points are executed along each path;", + " - Expand execution paths affected by conditional branches (if-else, switch), loops, and exception-handling blocks.", + "- Step 3: For each execution path extracted in Step 2, simulate function execution line by line and determine where SRC propagates based on the four possible propagation locations.", + "", + "Additional guidelines:", + "- Different propagation paths can exist due to control flow constructs (if-else, loops); identify all execution paths before analysis;", + "- Expand the first iteration of loops to analyze nested execution paths;", + "- Treat each conditional branch (if, switch) as a separate execution path;", + "- Expand nested conditions and loops to ensure all paths are analyzed." + ], + "analysis_examples": [ + "Example 1: Propagation via Sink, Function call, and Return", + "User:", + "Now I will give you a target function with the source point 'p' at line 1:", + "```javascript", + "function process(p) {", + " let a = p; // SRC: a", + " if (a > 0) {", + " let b = a + 1;", + " logValue(a); // Sink value: a", + " } else {", + " let c = a - 1;", + " updateValue(a); // Function call", + " }", + " return a; // Return statement: 'a' is returned", + "}", + "```", + "Where does the source variable 'p' at line 1 propagate within this function?", + "System:", + "Explanation:", + "Step 1: Identify SRC and its alias;", + "SRC: The variable p at line 1 is assigned to variable a;", + "Step 2: Identify key points and execution paths:", + "Path 1 (if branch): When a > 0, variable a is used in the sink statement at line 5 (logValue(a)) and returned at line 10;", + "Path 2 (else branch): When a <= 0, variable a is passed as an argument to updateValue at line 8 and then returned at line 10;", + "Step 3: Simulate each execution path:", + "In Path 1, propagation is from p -> a -> sink at logValue(a) and p -> a -> return a;", + 
"In Path 2, propagation is from p -> a -> updateValue(a) and p -> a -> return a;", + "Answer:", + "Path 1: Lines 1 -> 2 -> 4 -> 5 -> 10;", + "- Type: Sink; Name: a; Function: None; Index: None; Line: 5; Dependency: SRC (p) is assigned to 'a', then used in logValue;", + "- Type: Return; Name: a; Function: None; Index: 0; Line: 10; Dependency: SRC (p) is assigned to 'a', then returned.", + "Path 2: Lines 1 -> 2 -> 6 -> 8 -> 10;", + "- Type: Argument; Name: a; Function: updateValue; Index: 0; Line: 8; Dependency: SRC (p) is assigned to 'a', then passed as an argument to updateValue;", + "- Type: Return; Name: a; Function: None; Index: 0; Line: 10; Dependency: SRC (p) is assigned to 'a', then returned.", + "", + "Example 2: Propagation via Parameter Passing and Return", + "User:", + "Now I will give you a target function with the source point 'src' at line 1:", + "```javascript", + "function modifyValue(src, flag) {", + " if (flag > 0) {", + " flag = src; // source value: src", + " } else if (flag == 0) {", + " return src; // return statement", + " }", + " return -1; // Default return value ", + "}", + "```", + "Where does the source variable 'src' at line 1 propagate within this function?", + "System:", + "Explanation:", + "Step 1: Identify SRC and its alias;", + "SRC: The variable src is defined at line 1;", + "Step 2: Identify key points and execution paths:", + "Path 1 (flag > 0): src is assigned to flag at line 3, making it accessible outside the function if flag is referenced after the call;", + "Path 2 (flag == 0): src is returned at line 5, propagating to the caller;", + "Path 3 (flag < 0): Function returns -1, so SRC does not propagate in this path;", + "Step 3: Simulate the execution paths:", + "Path 1: When flag > 0, src is assigned to flag, allowing potential propagation outside the function through the parameter reference;", + "Path 2: When flag == 0, src is returned to the caller;", + "Path 3: When flag < 0, src does not propagate, as the function returns 
-1;", + "Answer:", + "Path 1: Lines 1 -> 3;", + "- Type: Parameter; Name: flag; Function: None; Index: 1; Line: 3; Dependency: SRC (src) is assigned to parameter 'flag', which may be referenced by the caller;", + "Path 2: Lines 1 -> 5;", + "- Type: Return; Name: src; Function: None; Index: 0; Line: 5; Dependency: SRC (src) is returned to the caller;", + "Path 3: Lines 1 -> 7;", + "- No propagation; Dependency: Default return value -1 is unrelated to SRC." + ], + "question_template": "- Where does the source variable at line in this function propagate?", + "answer_format_cot": [ + "(1) First, provide a detailed step-by-step reasoning process, following the explanation format used in the examples;", + "(2) Once the reasoning is complete, begin the final answer section with 'Answer:';", + "(3) For each execution path, list the propagation details using the following format:", + "- Path : ;", + " - For a function argument propagation: 'Type: Argument; Name: {argument name}; Function: {callee function name}; Index: {argument index}; Line: {call site line number}; Dependency: {summary of dependency from SRC to argument}';", + " - For a return propagation: 'Type: Return; Name: {return name}; Function: None; Index: {return value index}; Line: {return statement line number}; Dependency: {summary of dependency from SRC to return value}';", + " - For parameter propagation: 'Type: Parameter; Name: {parameter name}; Function: None; Index: {parameter index}; Line: {assignment line number}; Dependency: {summary of dependency from SRC to parameter}';", + " - For sink propagation: 'Type: Sink; Name: {sink name}; Function: None; Index: None; Line: {sink statement line number}; Dependency: {summary of dependency from SRC to sink}';", + "(4) If there is no propagation along a path, provide a brief explanation of why SRC does not propagate in that path as follows:", + "- Path : ;", + " - No propagation; Dependency: {reason for no propagation};", + "(5) Remember: All the indexes start 
from 0 instead of 1. If there is only one return value, the index is 0." + ], + "meta_prompts": [ + "Now I will give you a target function with the source point `` at line : \n```\n\n``` \n\n", + "You may see the following statements as potential sink points. Identify which of these are related to SRC and its aliases;\n", + "\n", + "Here are the Function call sites and return statements within the function, which can be used in Step 1;\n", + "\n", + "\n", + "Now, please answer the following question:\n\n", + "Your response should strictly follow the format:\n\n" + ] +} diff --git a/src/prompt/Javascript/dfbscan/path_validator.json b/src/prompt/Javascript/dfbscan/path_validator.json new file mode 100644 index 0000000..a46d22f --- /dev/null +++ b/src/prompt/Javascript/dfbscan/path_validator.json @@ -0,0 +1,93 @@ +{ + "model_role_name": "Path Validator", + "user_role_name": "Path Validator", + "system_role": "You are a Javascript programmer and very good at analyzing Javascript code. In particular, you are skilled at understanding how data flows across multiple functions.", + "task": "You will be provided with an interprocedural data-flow path along with a specified . Your task is to decide whether the given propagation path is reachable – that is, whether its path condition is satisfiable. 
For example, for NPD (null-pointer dereference) detection, if the dereferenced object is guarded by a branch condition such as 'p !== null', then the path should be deemed unreachable.", + "analysis_rules": [ + "Keep the following guidelines in mind:", + "- If the source in the first function flows to the sink in the last function without any interference, then the path is reachable and your answer should be Yes.", + "- For NPD detection, if the source value is modified or its null/undefined state is verified (for example, via an explicit check like 'p !== null') before reaching the sink, then the path is unreachable and you should answer No.", + "- If a function exits or returns before the sink or other propagation sites (such as function calls) are reached, the path is unreachable; answer No in such cases.", + "- Analyze conditions within each function: infer the outcome of branch statements and then verify whether the conditions across different sub-paths conflict. If conflicts exist, the overall path is unreachable.", + "- Consider the values of relevant variables; if those values contradict the necessary branch conditions for triggering the bug, the path is unreachable and you should answer No.", + "In summary, assess the conditions in every sub-path, check for conflicts, and decide whether the entire propagation path is reachable." 
+ ], + "question_template": [ + "When these functions are executed, does the following data-flow propagation path cause the bug?", + "```", + "", + "```", + "Provide your detailed explanation for this propagation path:", + "", + "" + ], + "analysis_examples": [ + "Example 1:", + "User:", + "Here is the Javascript program:", + "```javascript", + "function getArray(length) {", + " let array = null;", + " if (length > 0) {", + " array = new Array(length);", + " }", + " return array;", + "}", + "", + "function getElement(array, index) {", + " return array[index];", + "}", + "```", + "Does the following propagation path cause the NPD bug?", + "Propagation Path: 'array' at line 2 in getArray --> 'array' used at line 2 in getElement", + "Explanation: In getArray, if length <= 0, array remains null and is returned. In getElement, a null array would trigger a TypeError (null dereference) when accessed at line 10. However, when length > 0, the array is non-null. Since the conditions for array being null and non-null conflict, this propagation path is unreachable and does not cause the NPD bug.", + "Answer: No.", + "", + "Example 2:", + "User:", + "Here is the Javascript program:", + "```javascript", + "function foo(obj) {", + " if (obj === null) {", + " return null;", + " }", + " return obj;", + "}", + "", + "function bar() {", + " const myObj = foo(null);", + " myObj.toString();", + "}", + "```", + "Does the following propagation path cause the NPD bug?", + "Parameter 'obj' in foo --> foo returns null --> myObj assigned null in bar, which then gets dereferenced causing a method call on null", + "Explanation: The function foo returns null when passed a null input. In bar, this leads to myObj being null, which in turn causes a TypeError when calling toString(). As there is no conflicting branch condition preventing this case, the propagation path is reachable and causes the NPD bug.", + "Answer: Yes." 
+ ], + "additional_fact": [ + "Additional details may include whether specific lines fall within if-statements and the corresponding line numbers for those conditions.", + "For each line in the provided path, follow this reasoning:", + "- Indicate whether line {line_number} is inside the 'true' or 'else' branch of an if-statement.", + "- State whether, given the variable values, the branch condition will always be evaluated as true, always as false, or is indeterminate.", + "- Conclude whether line {line_number} is reachable.", + "After analyzing each line, decide if the overall path's condition is satisfiable (reachable) or not." + ], + "answer_format": [ + "(1) In the first line, provide your detailed reasoning and explanation.", + "(2) In the second line, simply state Yes or No.", + "Example:", + "Explanation: {Your detailed explanation.}", + "Answer: Yes" + ], + "meta_prompts": [ + "Now I will provide you with the program:", + "```", + "", + "```", + "Please answer the following question:", + "", + "Your answer should follow this format:", + "", + "Remember: Do not assume the behavior or return values of external methods not provided in the program. Only evaluate the conditions present in the given code." 
+ ] +} diff --git a/src/repoaudit.py b/src/repoaudit.py index 24d3639..042e6ff 100644 --- a/src/repoaudit.py +++ b/src/repoaudit.py @@ -10,6 +10,7 @@ from tstool.analyzer.Go_TS_analyzer import * from tstool.analyzer.Java_TS_analyzer import * from tstool.analyzer.Python_TS_analyzer import * +from tstool.analyzer.Javascript_TS_analyzer import * from typing import List @@ -17,6 +18,7 @@ "Cpp": ["MLK", "NPD", "UAF"], "Java": ["NPD"], "Python": ["NPD"], + "Javascript": ["NPD"], "Go": ["NPD"], } @@ -59,6 +61,8 @@ def __init__( suffixs = ["java"] elif self.language == "Python": suffixs = ["py"] + elif self.language == "Javascript": + suffixs = ["js", "jsx"] else: raise ValueError("Invalid language setting") @@ -82,6 +86,10 @@ def __init__( self.ts_analyzer = Python_TSAnalyzer( self.code_in_files, self.language, self.max_symbolic_workers ) + elif self.language == "Javascript": + self.ts_analyzer = Javascript_TSAnalyzer( + self.code_in_files, self.language, self.max_symbolic_workers + ) return def start_repo_auditing(self) -> None: diff --git a/src/run_repoaudit.sh b/src/run_repoaudit.sh index fbbc8bf..f0c6e92 100755 --- a/src/run_repoaudit.sh +++ b/src/run_repoaudit.sh @@ -1,7 +1,9 @@ #!/bin/bash SCAN_TYPE=$1 -LANGUAGE=Python -MODEL=claude-3.7 +LANGUAGE=Javascript +# MODEL=claude-3.7 +# MODEL=o3-mini +MODEL=gemini-2.5-flash BUG_TYPE=NPD PROJECT=toy diff --git a/src/tstool/analyzer/Javascript_TS_analyzer.py b/src/tstool/analyzer/Javascript_TS_analyzer.py new file mode 100644 index 0000000..510e6a2 --- /dev/null +++ b/src/tstool/analyzer/Javascript_TS_analyzer.py @@ -0,0 +1,336 @@ +import sys +from os import path +from typing import List, Tuple, Dict, Set +import tree_sitter + +sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))) + +from .TS_analyzer import * +from memory.syntactic.function import * +from memory.syntactic.value import * + + +class Javascript_TSAnalyzer(TSAnalyzer): + """ + TSAnalyzer for Javascript source files using tree-sitter. 
+ Implements Javascript-specific parsing and analysis. + """ + + def extract_function_info( + self, file_path: str, source_code: str, tree: tree_sitter.Tree + ) -> None: + """ + Parse the function information in a source file. + :param file_path: The path of the source file. + :param source_code: The content of the source file. + :param tree: The parse tree of the source file. + """ + all_function_header_nodes = find_nodes_by_type( + tree.root_node, "function_declaration" + ) + all_variable_declarator_nodes = find_nodes_by_type( + tree.root_node, "variable_declarator" + ) + + for node in all_function_header_nodes: + function_name = "" + for sub_node in node.children: + if sub_node.type == "identifier": + function_name = source_code[sub_node.start_byte : sub_node.end_byte] + break + + if function_name == "": + continue + + start_line_number = source_code[: node.start_byte].count("\n") + 1 + end_line_number = source_code[: node.end_byte].count("\n") + 1 + function_id = len(self.functionRawDataDic) + 1 + + self.functionRawDataDic[function_id] = ( + function_name, + start_line_number, + end_line_number, + node, + ) + self.functionToFile[function_id] = file_path + + if function_name not in self.functionNameToId: + self.functionNameToId[function_name] = set([]) + self.functionNameToId[function_name].add(function_id) + + for node in all_variable_declarator_nodes: + name_node = node.child_by_field_name("name") + value_node = node.child_by_field_name("value") + + if not name_node or not value_node: + continue + + if value_node.type != "arrow_function": + continue + + function_name = source_code[name_node.start_byte : name_node.end_byte] + start_line = source_code[:node.start_byte].count("\n") + 1 + end_line = source_code[:node.end_byte].count("\n") + 1 + function_id = len(self.functionRawDataDic) + 1 + + self.functionRawDataDic[function_id] = ( + function_name, start_line, end_line, node + ) + self.functionToFile[function_id] = file_path + 
self.functionNameToId.setdefault(function_name, set()).add(function_id) + + return + + def extract_global_info( + self, file_path: str, source_code: str, tree: tree_sitter.Tree + ) -> None: + """ + Parse global variable information from a Javascript source file. + For Javascript, this may include module-level variables. + Currently not implemented. + """ + # TODO: Add global variable analysis if needed. + return + + def get_callee_name_at_call_site( + self, node: tree_sitter.Node, source_code: str + ) -> str: + """ + Get the callee name at the call site. + :param node: the node of the call site + :param source_code: the content of the file + """ + function_name = "" + for sub_node in node.children: + if sub_node.type == "identifier": + function_name = source_code[sub_node.start_byte : sub_node.end_byte] + break + if sub_node.type == "member_expression": + for sub_sub_node in sub_node.children: + if sub_sub_node.type == "identifier": + function_name = source_code[ + sub_sub_node.start_byte : sub_sub_node.end_byte + ] + break + return function_name + + def get_callsites_by_callee_name( + self, current_function: Function, callee_name: str + ) -> List[tree_sitter.Node]: + """ + Find the call sites by the callee function name. + :param current_function: the function to be analyzed + :param callee_name: the callee function name + """ + results = [] + file_content = self.code_in_files[current_function.file_path] + call_site_nodes = find_nodes_by_type( + current_function.parse_tree_root_node, "call_expression" + ) + for call_site in call_site_nodes: + if ( + self.get_callee_name_at_call_site(call_site, file_content) + == callee_name + ): + results.append(call_site) + return results + + def get_arguments_at_callsite( + self, current_function: Function, call_site_node: tree_sitter.Node + ) -> Set[Value]: + """ + Get arguments from a call site in a function. 
+ :param current_function: the function to be analyzed + :param call_site_node: the node of the call site + :return: the arguments + """ + arguments: Set[Value] = set([]) + file_name = current_function.file_path + source_code = self.code_in_files[file_name] + for sub_node in call_site_node.children: + if sub_node.type == "arguments": + arg_list = sub_node.children[1:-1] + for element in arg_list: + if element.type != ",": + line_number = source_code[: element.start_byte].count("\n") + 1 + arguments.add( + Value( + source_code[element.start_byte : element.end_byte], + line_number, + ValueLabel.ARG, + file_name, + len(arguments), + ) + ) + return arguments + + def get_parameters_in_single_function( + self, current_function: Function + ) -> Set[Value]: + """ + Find the parameters of a function. + :param current_function: The function to be analyzed. + :return: A set of parameters as values + """ + if current_function.paras is not None: + return current_function.paras + current_function.paras = set([]) + file_content = self.code_in_files[current_function.file_path] + parameters = find_nodes_by_type( + current_function.parse_tree_root_node, "formal_parameters" + ) + + index = 0 + for parameter_node in parameters: + parameter_name = "" + for sub_node in parameter_node.children: + for sub_sub_node in find_nodes_by_type(sub_node, "identifier"): + parameter_name = file_content[ + sub_sub_node.start_byte : sub_sub_node.end_byte + ] + if parameter_name != "" and parameter_name != "self": + line_number = ( + file_content[: sub_node.start_byte].count("\n") + 1 + ) + current_function.paras.add( + Value( + parameter_name, + line_number, + ValueLabel.PARA, + current_function.file_path, + index, + ) + ) + index += 1 + return current_function.paras + + def get_return_values_in_single_function( + self, current_function: Function + ) -> Set[Value]: + """ + Find the return values of a Javascript function + :param current_function: The function to be analyzed. 
+ :return: A set of return values + """ + if current_function.retvals is not None: + return current_function.retvals + + current_function.retvals = set([]) + file_content = self.code_in_files[current_function.file_path] + retnodes = find_nodes_by_type( + current_function.parse_tree_root_node, "return_statement" + ) + for retnode in retnodes: + line_number = file_content[: retnode.start_byte].count("\n") + 1 + restmts_str = file_content[retnode.start_byte : retnode.end_byte] + returned_value = restmts_str.replace("return", "").strip() + current_function.retvals.add( + Value( + returned_value, + line_number, + ValueLabel.RET, + current_function.file_path, + 0, + ) + ) + return current_function.retvals + + def get_if_statements( + self, function: Function, source_code: str + ) -> Dict[Tuple, Tuple]: + """ + Identify if-statements in the Javascript function. + This is a simplified analysis for illustrative purposes. + """ + if_statement_nodes = find_nodes_by_type( + function.parse_tree_root_node, "if_statement" + ) + if_statements = {} + for if_node in if_statement_nodes: + condition_str = "" + condition_start_line = 0 + condition_end_line = 0 + true_branch_start_line = 0 + true_branch_end_line = 0 + else_branch_start_line = 0 + else_branch_end_line = 0 + + block_num = 0 + for sub_target in if_node.children: + if sub_target.type == "parenthesized_expression": + condition_start_line = ( + source_code[: sub_target.start_byte].count("\n") + 1 + ) + condition_end_line = ( + source_code[: sub_target.end_byte].count("\n") + 1 + ) + condition_str = source_code[ + sub_target.start_byte : sub_target.end_byte + ] + if sub_target.type == "statement_block": + lower_lines = [] + upper_lines = [] + for sub_sub in sub_target.children: + if sub_sub.type not in {"{", "}"}: + lower_lines.append( + source_code[: sub_sub.start_byte].count("\n") + 1 + ) + upper_lines.append( + source_code[: sub_sub.end_byte].count("\n") + 1 + ) + if lower_lines and upper_lines: + if block_num == 0: + 
true_branch_start_line = min(lower_lines) + true_branch_end_line = max(upper_lines) + block_num += 1 + elif block_num == 1: + else_branch_start_line = min(lower_lines) + else_branch_end_line = max(upper_lines) + block_num += 1 + if sub_target.type == "expression_statement": + true_branch_start_line = ( + source_code[: sub_target.start_byte].count("\n") + 1 + ) + true_branch_end_line = ( + source_code[: sub_target.end_byte].count("\n") + 1 + ) + + if_statement_start_line = source_code[: if_node.start_byte].count("\n") + 1 + if_statement_end_line = source_code[: if_node.end_byte].count("\n") + 1 + line_scope = (if_statement_start_line, if_statement_end_line) + info = ( + condition_start_line, + condition_end_line, + condition_str, + (true_branch_start_line, true_branch_end_line), + (else_branch_start_line, else_branch_end_line), + ) + if_statements[line_scope] = info + return if_statements + + def get_loop_statements( + self, function: Function, source_code: str + ) -> Dict[Tuple, Tuple]: + """ + Identify loop statements (for and while) in the Javascript function. + """ + loops = {} + loop_nodes = find_nodes_by_type(function.parse_tree_root_node, "for_statement") + loop_nodes.extend( + find_nodes_by_type(function.parse_tree_root_node, "for_in_statement") + ) + loop_nodes.extend( + find_nodes_by_type(function.parse_tree_root_node, "while_statement") + ) + for node in loop_nodes: + start_line = source_code[: node.start_byte].count("\n") + 1 + end_line = source_code[: node.end_byte].count("\n") + 1 + # Simplified header and body analysis. 
+ loops[(start_line, end_line)] = ( + start_line, + start_line, + "", + start_line, + end_line, + ) + return loops diff --git a/src/tstool/analyzer/TS_analyzer.py b/src/tstool/analyzer/TS_analyzer.py index 31118ab..dcafd26 100644 --- a/src/tstool/analyzer/TS_analyzer.py +++ b/src/tstool/analyzer/TS_analyzer.py @@ -156,6 +156,8 @@ def __init__( self.language = Language(str(language_path), "java") elif language_name == "Python": self.language = Language(str(language_path), "python") + elif language_name == "Javascript": + self.language = Language(str(language_path), "javascript") elif language_name == "Go": self.language = Language(str(language_path), "go") else: @@ -354,7 +356,11 @@ def extract_call_graph_edges(self, current_function: Function) -> None: file_content = self.fileContentDic[file_name] call_node_type = None - if self.language_name == "C" or self.language_name == "Cpp": + if ( + self.language_name == "C" + or self.language_name == "Cpp" + or self.language_name == "Javascript" + ): call_node_type = "call_expression" elif self.language_name == "Java": call_node_type = "method_invocation" @@ -367,7 +373,7 @@ def extract_call_graph_edges(self, current_function: Function) -> None: all_call_sites = find_nodes_by_type( current_function.parse_tree_root_node, call_node_type - ) + ) function_call_sites = [] api_call_sites = [] diff --git a/src/tstool/dfbscan_extractor/Javascript/Javascript_NPD_extractor.py b/src/tstool/dfbscan_extractor/Javascript/Javascript_NPD_extractor.py new file mode 100644 index 0000000..2ee656d --- /dev/null +++ b/src/tstool/dfbscan_extractor/Javascript/Javascript_NPD_extractor.py @@ -0,0 +1,41 @@ +from tstool.analyzer.TS_analyzer import * +from tstool.analyzer.Javascript_TS_analyzer import * +from ..dfbscan_extractor import * + + +class Javascript_NPD_Extractor(DFBScanExtractor): + def extract_sources(self, function: Function) -> List[Value]: + root_node = function.parse_tree_root_node + source_code = 
self.ts_analyzer.code_in_files[function.file_path] + file_path = function.file_path + null_value_nodes = find_nodes_by_type(root_node, "null") + null_value_nodes.extend(find_nodes_by_type(root_node, "undefined")) + + sources = [] + for node in null_value_nodes: + line_number = source_code[: node.start_byte].count("\n") + 1 + name = source_code[node.start_byte : node.end_byte] + sources.append(Value(name, line_number, ValueLabel.SRC, file_path)) + + return sources + + def extract_sinks(self, function: Function) -> List[Value]: + """ + Extract the sinks that can cause the null pointer dereferences from Javascript programs. + :param: function: Function object. + :return: List of sink values + """ + root_node = function.parse_tree_root_node + source_code = self.ts_analyzer.code_in_files[function.file_path] + file_path = function.file_path + + nodes = find_nodes_by_type(root_node, "member_expression") + nodes.extend(find_nodes_by_type(root_node, "subscript_expression")) + sinks = [] + + for node in nodes: + first_child = node.children[0] + line_number = source_code[: first_child.start_byte].count("\n") + 1 + name = source_code[first_child.start_byte : first_child.end_byte] + sinks.append(Value(name, line_number, ValueLabel.SINK, file_path, -1)) + return sinks diff --git a/src/tstool/dfbscan_extractor/Javascript/__init__.py b/src/tstool/dfbscan_extractor/Javascript/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tstool/dfbscan_extractor/Python/Python_NPD_extractor.py b/src/tstool/dfbscan_extractor/Python/Python_NPD_extractor.py index c59562d..caca262 100644 --- a/src/tstool/dfbscan_extractor/Python/Python_NPD_extractor.py +++ b/src/tstool/dfbscan_extractor/Python/Python_NPD_extractor.py @@ -1,8 +1,6 @@ from tstool.analyzer.TS_analyzer import * from tstool.analyzer.Python_TS_analyzer import * from ..dfbscan_extractor import * -import tree_sitter -import argparse class Python_NPD_Extractor(DFBScanExtractor): From 
024a2a76f5dbb4ac483cddb873ac3fd7824ccf5b Mon Sep 17 00:00:00 2001 From: acezxn Date: Thu, 28 Aug 2025 16:30:29 -0400 Subject: [PATCH 02/23] Modified build.py to build javascript tree sitter and added javascript test cases --- benchmark/Javascript/toy/NPD/case01.js | 17 +++++++++++++++++ benchmark/Javascript/toy/NPD/case02.js | 7 +++++++ benchmark/Javascript/toy/NPD/case03.js | 11 +++++++++++ benchmark/Javascript/toy/NPD/case04.js | 17 +++++++++++++++++ benchmark/Javascript/toy/NPD/case05.js | 20 ++++++++++++++++++++ lib/build.py | 7 +++++++ 6 files changed, 79 insertions(+) create mode 100644 benchmark/Javascript/toy/NPD/case01.js create mode 100644 benchmark/Javascript/toy/NPD/case02.js create mode 100644 benchmark/Javascript/toy/NPD/case03.js create mode 100644 benchmark/Javascript/toy/NPD/case04.js create mode 100644 benchmark/Javascript/toy/NPD/case05.js diff --git a/benchmark/Javascript/toy/NPD/case01.js b/benchmark/Javascript/toy/NPD/case01.js new file mode 100644 index 0000000..e7251ef --- /dev/null +++ b/benchmark/Javascript/toy/NPD/case01.js @@ -0,0 +1,17 @@ +function hello() { + let output = []; + + for (let i = 0; i < 5; i++) { + output.push(null); + } + return output; +} + +function hello2() { + let output = hello(); + for (let i = 0; i < 4; i++) { + output[i] = i.toString(); + } + return output[4].length; +} + diff --git a/benchmark/Javascript/toy/NPD/case02.js b/benchmark/Javascript/toy/NPD/case02.js new file mode 100644 index 0000000..393be53 --- /dev/null +++ b/benchmark/Javascript/toy/NPD/case02.js @@ -0,0 +1,7 @@ +function getLength(value) { + return value.length; +} + +const print = () => { + console.log(getLength(null)); +} diff --git a/benchmark/Javascript/toy/NPD/case03.js b/benchmark/Javascript/toy/NPD/case03.js new file mode 100644 index 0000000..b620a91 --- /dev/null +++ b/benchmark/Javascript/toy/NPD/case03.js @@ -0,0 +1,11 @@ +function getLength2(value) { + if (!value) { + return 0; + } + return value.length; +} + +const print2 = () => { 
+ let a = getLength2(null); + console.log(); +} \ No newline at end of file diff --git a/benchmark/Javascript/toy/NPD/case04.js b/benchmark/Javascript/toy/NPD/case04.js new file mode 100644 index 0000000..5af5d93 --- /dev/null +++ b/benchmark/Javascript/toy/NPD/case04.js @@ -0,0 +1,17 @@ +function hello3() { + let output = []; + + for (let i = 0; i < 5; i++) { + output.push(null); + } + return output; +} + +function hello4() { + let output = hello3(); + for (let i = 0; i < 4; i++) { + output[i] = i.toString(); + } + return output[4] ? output[4].length : 0; +} + diff --git a/benchmark/Javascript/toy/NPD/case05.js b/benchmark/Javascript/toy/NPD/case05.js new file mode 100644 index 0000000..0ef2d26 --- /dev/null +++ b/benchmark/Javascript/toy/NPD/case05.js @@ -0,0 +1,20 @@ +function hello5() { + let output = []; + + for (let i = 0; i < 5; i++) { + output.push(null); + } + return output; +} + +function hello6() { + let output = hello5(); + for (let i = 0; i < 4; i++) { + output[i] = i.toString(); + } + if (output[4] !== null && output[4] !== undefined) { + return output[4].length; + } + return 0; +} + diff --git a/lib/build.py b/lib/build.py index bf7940e..7f59bfd 100644 --- a/lib/build.py +++ b/lib/build.py @@ -25,6 +25,12 @@ os.system( f'git clone https://github.com/tree-sitter/tree-sitter-python.git {cwd / "vendor/tree-sitter-python"}' ) + +if not (cwd / "vendor/tree-sitter-javascript/grammar.js").exists(): + os.system( + f'git clone https://github.com/tree-sitter/tree-sitter-javascript.git {cwd / "vendor/tree-sitter-javascript"}' + ) + if not (cwd / "vendor/tree-sitter-go/grammar.js").exists(): os.system( @@ -41,6 +47,7 @@ str(cwd / "vendor/tree-sitter-cpp"), str(cwd / "vendor/tree-sitter-java"), str(cwd / "vendor/tree-sitter-python"), + str(cwd / "vendor/tree-sitter-javascript"), str(cwd / "vendor/tree-sitter-go"), ], ) From 4ff5057439a48b10a0ceb1059b9d5e825080a70d Mon Sep 17 00:00:00 2001 From: acezxn Date: Fri, 29 Aug 2025 18:05:00 -0400 Subject: [PATCH 03/23] 
Added delete operator and call expression as source and sink in javascript NPD extraction --- benchmark/Javascript/toy/NPD/case02.js | 12 ++++++-- benchmark/Javascript/toy/NPD/case05.js | 29 +++++++++---------- src/run_repoaudit.sh | 2 +- .../Javascript/Javascript_NPD_extractor.py | 15 ++++++++-- .../dfbscan_extractor/dfbscan_extractor.py | 1 + 5 files changed, 37 insertions(+), 22 deletions(-) diff --git a/benchmark/Javascript/toy/NPD/case02.js b/benchmark/Javascript/toy/NPD/case02.js index 393be53..9003b3d 100644 --- a/benchmark/Javascript/toy/NPD/case02.js +++ b/benchmark/Javascript/toy/NPD/case02.js @@ -1,7 +1,13 @@ -function getLength(value) { - return value.length; +function func_generator(value) { + let fn = null; + if (value % 3 == 0) { + fn = console.log; + } else if (value % 3 == 1) { + fn = console.error; + } + return fn; } const print = () => { - console.log(getLength(null)); + func_generator(8)("Hello world!"); } diff --git a/benchmark/Javascript/toy/NPD/case05.js b/benchmark/Javascript/toy/NPD/case05.js index 0ef2d26..f97f1df 100644 --- a/benchmark/Javascript/toy/NPD/case05.js +++ b/benchmark/Javascript/toy/NPD/case05.js @@ -1,20 +1,17 @@ -function hello5() { - let output = []; - - for (let i = 0; i < 5; i++) { - output.push(null); - } - return output; +function func(value) { + return func2(value); } -function hello6() { - let output = hello5(); - for (let i = 0; i < 4; i++) { - output[i] = i.toString(); - } - if (output[4] !== null && output[4] !== undefined) { - return output[4].length; - } - return 0; +function func2(value) { + console.log(+value.prop); + delete value.prop; + return value; } +const printprop = () => { + let d = { + prop: "1" + }; + d = func(d); + console.log(d.prop.length); +} \ No newline at end of file diff --git a/src/run_repoaudit.sh b/src/run_repoaudit.sh index f0c6e92..5bab301 100755 --- a/src/run_repoaudit.sh +++ b/src/run_repoaudit.sh @@ -3,7 +3,7 @@ SCAN_TYPE=$1 LANGUAGE=Javascript # MODEL=claude-3.7 # MODEL=o3-mini 
-MODEL=gemini-2.5-flash +MODEL=gemini-2.0-flash BUG_TYPE=NPD PROJECT=toy diff --git a/src/tstool/dfbscan_extractor/Javascript/Javascript_NPD_extractor.py b/src/tstool/dfbscan_extractor/Javascript/Javascript_NPD_extractor.py index 2ee656d..0dc2fd4 100644 --- a/src/tstool/dfbscan_extractor/Javascript/Javascript_NPD_extractor.py +++ b/src/tstool/dfbscan_extractor/Javascript/Javascript_NPD_extractor.py @@ -10,13 +10,23 @@ def extract_sources(self, function: Function) -> List[Value]: file_path = function.file_path null_value_nodes = find_nodes_by_type(root_node, "null") null_value_nodes.extend(find_nodes_by_type(root_node, "undefined")) - + unary_expressions = find_nodes_by_type(root_node, "unary_expression") + sources = [] + + for unary_expression in unary_expressions: + operator = unary_expression.child(0) + if operator is not None and operator.type == "delete": + line_number = source_code[: unary_expression.start_byte].count("\n") + 1 + name = source_code[unary_expression.start_byte : unary_expression.end_byte] + sources.append(Value(name, line_number, ValueLabel.SRC, file_path)) + + for node in null_value_nodes: line_number = source_code[: node.start_byte].count("\n") + 1 name = source_code[node.start_byte : node.end_byte] sources.append(Value(name, line_number, ValueLabel.SRC, file_path)) - + return sources def extract_sinks(self, function: Function) -> List[Value]: @@ -31,6 +41,7 @@ def extract_sinks(self, function: Function) -> List[Value]: nodes = find_nodes_by_type(root_node, "member_expression") nodes.extend(find_nodes_by_type(root_node, "subscript_expression")) + nodes.extend(find_nodes_by_type(root_node, "call_expression")) sinks = [] for node in nodes: diff --git a/src/tstool/dfbscan_extractor/dfbscan_extractor.py b/src/tstool/dfbscan_extractor/dfbscan_extractor.py index d300cdb..d225ecd 100644 --- a/src/tstool/dfbscan_extractor/dfbscan_extractor.py +++ b/src/tstool/dfbscan_extractor/dfbscan_extractor.py @@ -34,6 +34,7 @@ def extract_all(self) -> 
Tuple[List[Value], List[Value]]: function_root_node = function.parse_tree_root_node self.sources.extend(self.extract_sources(function)) self.sinks.extend(self.extract_sinks(function)) + return self.sources, self.sinks @abstractmethod From 062cd1e399d43b3b2606f33cfb8bc22e1934b1d1 Mon Sep 17 00:00:00 2001 From: acezxn Date: Wed, 3 Sep 2025 21:06:22 -0400 Subject: [PATCH 04/23] Update buggy path computation to exclude sinks of already defined function calls --- .gitignore | 1 + benchmark/Javascript/toy/NPD/case01.js | 21 +++++++------------ benchmark/Javascript/toy/NPD/case02.js | 1 + src/agent/dfbscan.py | 15 ++++++++++--- .../Javascript/dfbscan/path_validator.json | 1 + 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 1e8dd82..31fc4d2 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,4 @@ cython_debug/ #.idea/ testcases/** +.vscode \ No newline at end of file diff --git a/benchmark/Javascript/toy/NPD/case01.js b/benchmark/Javascript/toy/NPD/case01.js index e7251ef..eda676e 100644 --- a/benchmark/Javascript/toy/NPD/case01.js +++ b/benchmark/Javascript/toy/NPD/case01.js @@ -1,17 +1,10 @@ -function hello() { - let output = []; - - for (let i = 0; i < 5; i++) { - output.push(null); - } - return output; +function test2_process(data) { + let value = data[0]; + return value; } + -function hello2() { - let output = hello(); - for (let i = 0; i < 4; i++) { - output[i] = i.toString(); - } - return output[4].length; +function test2_caller() { + let data = null; + return test2_process(data) } - diff --git a/benchmark/Javascript/toy/NPD/case02.js b/benchmark/Javascript/toy/NPD/case02.js index 9003b3d..f4d20ba 100644 --- a/benchmark/Javascript/toy/NPD/case02.js +++ b/benchmark/Javascript/toy/NPD/case02.js @@ -10,4 +10,5 @@ function func_generator(value) { const print = () => { func_generator(8)("Hello world!"); + console.log("Done"); } diff --git a/src/agent/dfbscan.py b/src/agent/dfbscan.py index 027fe0a..daac04e 100644 --- 
a/src/agent/dfbscan.py +++ b/src/agent/dfbscan.py @@ -362,9 +362,18 @@ def __collect_potential_buggy_paths( if value.label == ValueLabel.SINK: # For NPD-style bug types if self.is_reachable: - self.state.update_potential_buggy_paths( - src_value, path_with_unknown_status + [value] - ) + + # Checks if the sink is a called to a predefined function + is_defined_function = False + for func in self.ts_analyzer.function_env.values(): + if value.name == func.function_name: + is_defined_function = True + break + + if not is_defined_function: + self.state.update_potential_buggy_paths( + src_value, path_with_unknown_status + [value] + ) elif value.label in { ValueLabel.PARA, ValueLabel.RET, diff --git a/src/prompt/Javascript/dfbscan/path_validator.json b/src/prompt/Javascript/dfbscan/path_validator.json index a46d22f..ae80658 100644 --- a/src/prompt/Javascript/dfbscan/path_validator.json +++ b/src/prompt/Javascript/dfbscan/path_validator.json @@ -8,6 +8,7 @@ "- If the source in the first function flows to the sink in the last function without any interference, then the path is reachable and your answer should be Yes.", "- For NPD detection, if the source value is modified or its null/undefined state is verified (for example, via an explicit check like 'p !== null') before reaching the sink, then the path is unreachable and you should answer No.", "- If a function exits or returns before the sink or other propagation sites (such as function calls) are reached, the path is unreachable; answer No in such cases.", + "- If a sink is a call to an object or a function that is builtin in Javascript or defined in the scope, then the path is unreachable; answer No in such cases.", "- Analyze conditions within each function: infer the outcome of branch statements and then verify whether the conditions across different sub-paths conflict. 
If conflicts exist, the overall path is unreachable.", "- Consider the values of relevant variables; if those values contradict the necessary branch conditions for triggering the bug, the path is unreachable and you should answer No.", "In summary, assess the conditions in every sub-path, check for conflicts, and decide whether the entire propagation path is reachable." From 6d6a83ea3c13b997f39f207666920930980096bf Mon Sep 17 00:00:00 2001 From: acezxn Date: Fri, 5 Sep 2025 10:23:30 -0400 Subject: [PATCH 05/23] Improved prompt and removed code duplicates in path validator input --- benchmark/Javascript/toy/NPD/case06.js | 16 ++++++++++++++++ src/llmtool/dfbscan/path_validator.py | 9 +++++++-- .../dfbscan/intra_dataflow_analyzer.json | 4 +++- 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 benchmark/Javascript/toy/NPD/case06.js diff --git a/benchmark/Javascript/toy/NPD/case06.js b/benchmark/Javascript/toy/NPD/case06.js new file mode 100644 index 0000000..420a328 --- /dev/null +++ b/benchmark/Javascript/toy/NPD/case06.js @@ -0,0 +1,16 @@ +function process_data(myobj) { + const inner_processing = (myobj) => { + delete myobj.func; + return myobj; + } + myobj.func("Hello"); + myobj = inner_processing(myobj); + myobj.func("Hello"); +} + +function main() { + let myobj = { + func: console.log + }; + process_data(myobj) +} \ No newline at end of file diff --git a/src/llmtool/dfbscan/path_validator.py b/src/llmtool/dfbscan/path_validator.py index 069edfc..08a8694 100644 --- a/src/llmtool/dfbscan/path_validator.py +++ b/src/llmtool/dfbscan/path_validator.py @@ -86,11 +86,16 @@ def _get_prompt(self, input: LLMToolInput) -> str: value_lines.append(value_line) prompt = prompt.replace("", "\n".join(value_lines)) prompt = prompt.replace("", input.bug_type) + + functions: Set[Function] = set() + for func in input.values_to_functions.values(): + if func is not None: + functions.add(func) program = "\n".join( [ - "```\n" + func.lined_code + "\n```\n" if func is 
not None else "\n" - for func in input.values_to_functions.values() + "```\n" + func.lined_code + "\n```\n" + for func in functions ] ) prompt = prompt.replace("", program) diff --git a/src/prompt/Javascript/dfbscan/intra_dataflow_analyzer.json b/src/prompt/Javascript/dfbscan/intra_dataflow_analyzer.json index 740a16c..074e484 100644 --- a/src/prompt/Javascript/dfbscan/intra_dataflow_analyzer.json +++ b/src/prompt/Javascript/dfbscan/intra_dataflow_analyzer.json @@ -24,7 +24,8 @@ "- Step 3: For each execution path extracted in Step 2, simulate function execution line by line and determine where SRC propagates based on the four possible propagation locations.", "", "Additional guidelines:", - "- Different propagation paths can exist due to control flow constructs (if-else, loops); identify all execution paths before analysis;", + "- Different propagation paths can exist due to control flow constructs (if-else, loops, race conditions); identify all execution paths before analysis;", + "- If there are potential race conditions, for example, calling an async function without await, differentiate the control paths to consider the race condition;", "- Expand the first iteration of loops to analyze nested execution paths;", "- Treat each conditional branch (if, switch) as a separate execution path;", "- Expand nested conditions and loops to ensure all paths are analyzed." @@ -112,6 +113,7 @@ "(4) If there is no propagation along a path, provide a brief explanation of why SRC does not propagate in that path as follows:", "- Path : ;", " - No propagation; Dependency: {reason for no propagation};", + "(5) Each Execution Path should start with the word \"Lines\", with each line number separated by \" -> \" and ended with a semicolon.", "(5) Remember: All the indexes start from 0 instead of 1. If there is only one return value, the index is 0." 
], "meta_prompts": [ From 9b2d6fb139e2242d04297b5f25c8e8c556131e1c Mon Sep 17 00:00:00 2001 From: acezxn Date: Fri, 5 Sep 2025 11:50:06 -0400 Subject: [PATCH 06/23] Added known javascript builtin that could return null for NPD source detection and tested auditing against microlight.js --- .gitmodules | 3 + benchmark/Javascript/microlight/LICENSE | 22 ++ benchmark/Javascript/microlight/README.md | 15 ++ benchmark/Javascript/microlight/bower.json | 30 +++ benchmark/Javascript/microlight/microlight.js | 210 ++++++++++++++++++ benchmark/Javascript/microlight/package.json | 27 +++ src/tstool/analyzer/Javascript_TS_analyzer.py | 2 +- .../Javascript/Javascript_NPD_extractor.py | 35 ++- 8 files changed, 342 insertions(+), 2 deletions(-) create mode 100644 benchmark/Javascript/microlight/LICENSE create mode 100644 benchmark/Javascript/microlight/README.md create mode 100644 benchmark/Javascript/microlight/bower.json create mode 100644 benchmark/Javascript/microlight/microlight.js create mode 100644 benchmark/Javascript/microlight/package.json diff --git a/.gitmodules b/.gitmodules index 43367c1..9b98d5f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "benchmark/Go/sally"] path = benchmark/Go/sally url = https://github.com/uber-go/sally.git +[submodule "benchmark/Javascript/toy/NPD/microlight"] + path = benchmark/Javascript/toy/NPD/microlight + url = https://github.com/asvd/microlight.git diff --git a/benchmark/Javascript/microlight/LICENSE b/benchmark/Javascript/microlight/LICENSE new file mode 100644 index 0000000..b204ed1 --- /dev/null +++ b/benchmark/Javascript/microlight/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2016 asvd + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/benchmark/Javascript/microlight/README.md b/benchmark/Javascript/microlight/README.md new file mode 100644 index 0000000..6e815f5 --- /dev/null +++ b/benchmark/Javascript/microlight/README.md @@ -0,0 +1,15 @@ +microlight.js +============= + +*microlight.js* is a tiny library (2.2k minified) which improves + readability of code snippets by highlighting, for any programming + language, without attaching additional language packages or styles: + +![preview](http://asvd.github.io/microlight/microlight-preview-big.png) + +For demos and usage guide, refer to https://asvd.github.io/microlight + +-- + +Follow me on twitter: https://twitter.com/asvd0 + diff --git a/benchmark/Javascript/microlight/bower.json b/benchmark/Javascript/microlight/bower.json new file mode 100644 index 0000000..b89fc78 --- /dev/null +++ b/benchmark/Javascript/microlight/bower.json @@ -0,0 +1,30 @@ +{ + "name": "microlight", + "version": "0.0.7", + "homepage": "https://github.com/asvd/microlight", + "authors": [ + "Dmitry Prokashev " + ], + "description": "highlights code in any language", + "main": "microlight.js", + "moduleType": [ + "amd", + "globals" + ], + "keywords": [ + "syntax", + "highlight", + "highlighting", + "source", + "code", + "source code", + 
"snippet", + "code snippet" + ], + "license": "MIT", + "ignore": [ + "**/.*", + "node_modules", + "bower_components" + ] +} diff --git a/benchmark/Javascript/microlight/microlight.js b/benchmark/Javascript/microlight/microlight.js new file mode 100644 index 0000000..5aa193a --- /dev/null +++ b/benchmark/Javascript/microlight/microlight.js @@ -0,0 +1,210 @@ +/** + * @fileoverview microlight - syntax highlightning library + * @version 0.0.7 + * + * @license MIT, see http://github.com/asvd/microlight + * @copyright 2016 asvd + * + * Code structure aims at minimizing the compressed library size + */ + + +(function (root, factory) { + if (typeof define === 'function' && define.amd) { + define(['exports'], factory); + } else if (typeof exports !== 'undefined') { + factory(exports); + } else { + factory((root.microlight = {})); + } +}(this, function (exports) { + // for better compression + var _window = window, + _document = document, + appendChild = 'appendChild', + test = 'test', + // style and color templates + textShadow = ';text-shadow:', + opacity = 'opacity:.', + _0px_0px = ' 0px 0px ', + _3px_0px_5 = '3px 0px 5', + brace = ')', + + i, + microlighted, + el; // current microlighted element to run through + + + + var reset = function(cls) { + // nodes to highlight + microlighted = _document.getElementsByClassName(cls||'microlight'); + + for (i = 0; el = microlighted[i++];) { + var text = el.textContent, + pos = 0, // current position + next1 = text[0], // next character + chr = 1, // current character + prev1, // previous character + prev2, // the one before the previous + token = // current token content + el.innerHTML = '', // (and cleaning the node) + + // current token type: + // 0: anything else (whitespaces / newlines) + // 1: operator or brace + // 2: closing braces (after which '/' is division not regex) + // 3: (key)word + // 4: regex + // 5: string starting with " + // 6: string starting with ' + // 7: xml comment + // 8: multiline comment /* */ + // 9: 
single-line comment starting with two slashes // + // 10: single-line comment starting with hash # + tokenType = 0, + + // kept to determine between regex and division + lastTokenType, + // flag determining if token is multi-character + multichar, + node, + + // calculating the colors for the style templates + colorArr = /(\d*\, \d*\, \d*)(, ([.\d]*))?/g.exec( + _window.getComputedStyle(el).color + ), + pxColor = 'px rgba('+colorArr[1]+',', + alpha = colorArr[3]||1; + + // running through characters and highlighting + while (prev2 = prev1, + // escaping if needed (with except for comments) + // pervious character will not be therefore + // recognized as a token finalize condition + prev1 = tokenType < 7 && prev1 == '\\' ? 1 : chr + ) { + chr = next1; + next1=text[++pos]; + multichar = token.length > 1; + + // checking if current token should be finalized + if (!chr || // end of content + // types 9-10 (single-line comments) end with a + // newline + (tokenType > 8 && chr == '\n') || + [ // finalize conditions for other token types + // 0: whitespaces + /\S/[test](chr), // merged together + // 1: operators + 1, // consist of a single character + // 2: braces + 1, // consist of a single character + // 3: (key)word + !/[$\w]/[test](chr), + // 4: regex + (prev1 == '/' || prev1 == '\n') && multichar, + // 5: string with " + prev1 == '"' && multichar, + // 6: string with ' + prev1 == "'" && multichar, + // 7: xml comment + text[pos-4]+prev2+prev1 == '-->', + // 8: multiline comment + prev2+prev1 == '*/' + ][tokenType] + ) { + // appending the token to the result + if (token) { + // remapping token type into style + // (some types are highlighted similarly) + el[appendChild]( + node = _document.createElement('span') + ).setAttribute('style', [ + // 0: not formatted + '', + // 1: keywords + textShadow + _0px_0px+9+pxColor + alpha * .7 + '),' + + _0px_0px+2+pxColor + alpha * .4 + brace, + // 2: punctuation + opacity + 6 + + textShadow + _0px_0px+7+pxColor + alpha / 4 + '),' 
+ + _0px_0px+3+pxColor + alpha / 4 + brace, + // 3: strings and regexps + opacity + 7 + + textShadow + _3px_0px_5+pxColor + alpha / 5 + '),-' + + _3px_0px_5+pxColor + alpha / 5 + brace, + // 4: comments + 'font-style:italic;'+ + opacity + 5 + + textShadow + _3px_0px_5+pxColor + alpha / 4 + '),-' + + _3px_0px_5+pxColor + alpha / 4 + brace + ][ + // not formatted + !tokenType ? 0 : + // punctuation + tokenType < 3 ? 2 : + // comments + tokenType > 6 ? 4 : + // regex and strings + tokenType > 3 ? 3 : + // otherwise tokenType == 3, (key)word + // (1 if regexp matches, 0 otherwise) + + /^(a(bstract|lias|nd|rguments|rray|s(m|sert)?|uto)|b(ase|egin|ool(ean)?|reak|yte)|c(ase|atch|har|hecked|lass|lone|ompl|onst|ontinue)|de(bugger|cimal|clare|f(ault|er)?|init|l(egate|ete)?)|do|double|e(cho|ls?if|lse(if)?|nd|nsure|num|vent|x(cept|ec|p(licit|ort)|te(nds|nsion|rn)))|f(allthrough|alse|inal(ly)?|ixed|loat|or(each)?|riend|rom|unc(tion)?)|global|goto|guard|i(f|mp(lements|licit|ort)|n(it|clude(_once)?|line|out|stanceof|t(erface|ernal)?)?|s)|l(ambda|et|ock|ong)|m(icrolight|odule|utable)|NaN|n(amespace|ative|ext|ew|il|ot|ull)|o(bject|perator|r|ut|verride)|p(ackage|arams|rivate|rotected|rotocol|ublic)|r(aise|e(adonly|do|f|gister|peat|quire(_once)?|scue|strict|try|turn))|s(byte|ealed|elf|hort|igned|izeof|tatic|tring|truct|ubscript|uper|ynchronized|witch)|t(emplate|hen|his|hrows?|ransient|rue|ry|ype(alias|def|id|name|of))|u(n(checked|def(ined)?|ion|less|signed|til)|se|sing)|v(ar|irtual|oid|olatile)|w(char_t|hen|here|hile|ith)|xor|yield)$/[test](token) + ]); + + node[appendChild](_document.createTextNode(token)); + } + + // saving the previous token type + // (skipping whitespaces and comments) + lastTokenType = + (tokenType && tokenType < 7) ? 
+ tokenType : lastTokenType; + + // initializing a new token + token = ''; + + // determining the new token type (going up the + // list until matching a token type start + // condition) + tokenType = 11; + while (![ + 1, // 0: whitespace + // 1: operator or braces + /[\/{}[(\-+*=<>:;|\\.,?!&@~]/[test](chr), + /[\])]/[test](chr), // 2: closing brace + /[$\w]/[test](chr), // 3: (key)word + chr == '/' && // 4: regex + // previous token was an + // opening brace or an + // operator (otherwise + // division, not a regex) + (lastTokenType < 2) && + // workaround for xml + // closing tags + prev1 != '<', + chr == '"', // 5: string with " + chr == "'", // 6: string with ' + // 7: xml comment + chr+next1+text[pos+1]+text[pos+2] == ' - // 8: multiline comment /* */ - // 9: single-line comment starting with two slashes // - // 10: single-line comment starting with hash # - tokenType = 0, - - // kept to determine between regex and division - lastTokenType, - // flag determining if token is multi-character - multichar, - node, - - // calculating the colors for the style templates - colorArr = /(\d*\, \d*\, \d*)(, ([.\d]*))?/g.exec( - _window.getComputedStyle(el).color - ), - pxColor = 'px rgba('+colorArr[1]+',', - alpha = colorArr[3]||1; - - // running through characters and highlighting - while (prev2 = prev1, - // escaping if needed (with except for comments) - // pervious character will not be therefore - // recognized as a token finalize condition - prev1 = tokenType < 7 && prev1 == '\\' ? 
1 : chr - ) { - chr = next1; - next1=text[++pos]; - multichar = token.length > 1; - - // checking if current token should be finalized - if (!chr || // end of content - // types 9-10 (single-line comments) end with a - // newline - (tokenType > 8 && chr == '\n') || - [ // finalize conditions for other token types - // 0: whitespaces - /\S/[test](chr), // merged together - // 1: operators - 1, // consist of a single character - // 2: braces - 1, // consist of a single character - // 3: (key)word - !/[$\w]/[test](chr), - // 4: regex - (prev1 == '/' || prev1 == '\n') && multichar, - // 5: string with " - prev1 == '"' && multichar, - // 6: string with ' - prev1 == "'" && multichar, - // 7: xml comment - text[pos-4]+prev2+prev1 == '-->', - // 8: multiline comment - prev2+prev1 == '*/' - ][tokenType] - ) { - // appending the token to the result - if (token) { - // remapping token type into style - // (some types are highlighted similarly) - el[appendChild]( - node = _document.createElement('span') - ).setAttribute('style', [ - // 0: not formatted - '', - // 1: keywords - textShadow + _0px_0px+9+pxColor + alpha * .7 + '),' + - _0px_0px+2+pxColor + alpha * .4 + brace, - // 2: punctuation - opacity + 6 + - textShadow + _0px_0px+7+pxColor + alpha / 4 + '),' + - _0px_0px+3+pxColor + alpha / 4 + brace, - // 3: strings and regexps - opacity + 7 + - textShadow + _3px_0px_5+pxColor + alpha / 5 + '),-' + - _3px_0px_5+pxColor + alpha / 5 + brace, - // 4: comments - 'font-style:italic;'+ - opacity + 5 + - textShadow + _3px_0px_5+pxColor + alpha / 4 + '),-' + - _3px_0px_5+pxColor + alpha / 4 + brace - ][ - // not formatted - !tokenType ? 0 : - // punctuation - tokenType < 3 ? 2 : - // comments - tokenType > 6 ? 4 : - // regex and strings - tokenType > 3 ? 
3 : - // otherwise tokenType == 3, (key)word - // (1 if regexp matches, 0 otherwise) - + /^(a(bstract|lias|nd|rguments|rray|s(m|sert)?|uto)|b(ase|egin|ool(ean)?|reak|yte)|c(ase|atch|har|hecked|lass|lone|ompl|onst|ontinue)|de(bugger|cimal|clare|f(ault|er)?|init|l(egate|ete)?)|do|double|e(cho|ls?if|lse(if)?|nd|nsure|num|vent|x(cept|ec|p(licit|ort)|te(nds|nsion|rn)))|f(allthrough|alse|inal(ly)?|ixed|loat|or(each)?|riend|rom|unc(tion)?)|global|goto|guard|i(f|mp(lements|licit|ort)|n(it|clude(_once)?|line|out|stanceof|t(erface|ernal)?)?|s)|l(ambda|et|ock|ong)|m(icrolight|odule|utable)|NaN|n(amespace|ative|ext|ew|il|ot|ull)|o(bject|perator|r|ut|verride)|p(ackage|arams|rivate|rotected|rotocol|ublic)|r(aise|e(adonly|do|f|gister|peat|quire(_once)?|scue|strict|try|turn))|s(byte|ealed|elf|hort|igned|izeof|tatic|tring|truct|ubscript|uper|ynchronized|witch)|t(emplate|hen|his|hrows?|ransient|rue|ry|ype(alias|def|id|name|of))|u(n(checked|def(ined)?|ion|less|signed|til)|se|sing)|v(ar|irtual|oid|olatile)|w(char_t|hen|here|hile|ith)|xor|yield)$/[test](token) - ]); - - node[appendChild](_document.createTextNode(token)); - } - - // saving the previous token type - // (skipping whitespaces and comments) - lastTokenType = - (tokenType && tokenType < 7) ? - tokenType : lastTokenType; - - // initializing a new token - token = ''; - - // determining the new token type (going up the - // list until matching a token type start - // condition) - tokenType = 11; - while (![ - 1, // 0: whitespace - // 1: operator or braces - /[\/{}[(\-+*=<>:;|\\.,?!&@~]/[test](chr), - /[\])]/[test](chr), // 2: closing brace - /[$\w]/[test](chr), // 3: (key)word - chr == '/' && // 4: regex - // previous token was an - // opening brace or an - // operator (otherwise - // division, not a regex) - (lastTokenType < 2) && - // workaround for xml - // closing tags - prev1 != '<', - chr == '"', // 5: string with " - chr == "'", // 6: string with ' - // 7: xml comment - chr+next1+text[pos+1]+text[pos+2] == '