Say, you would like to traverse a function’s AST tree and for every tree node, you would like to find the identifiers and print a message “node has identifier id”.
Let’s take a simple function:
void aexp_test_1 ()
{
x = 8 * ++a - b;
}
The traversal we have in mind is given an AST node, traverse the child nodes, and for each childnode, traverse its childnodes that are identifiers and select both the node and the identifier. Ok, this may not be the most efficient algorithm, but let’s not worry about optimizations for now.
Let’s take this step by step. First we need an AST node:
g.V()
.has('type','Function')
.has('code','aexp_test_1')
.functionToAST()
Then we need its AST nodes:
.astNodes()
Each of these AST nodes needs to be labeled for later use in a select
step:
.as('astnode')
Next, we need to traverse its children, but we are only interested in the identifier nodes, which we will label with identifier
:
.repeat(out(AST_EDGE)).emit{ it.get().value('type') == 'Identifier' }
.as('identifier')
Then, we select both, and we are interested in the code
property:
.select('astnode','identifier')
.by('code')
We run the traversal, but we get an error message:
java.lang.ClassCastException
Something is not right! When a traversal does not work, I try to comment out steps from the end until the traversal works. In this case, it is the by
step that causes problems. Let’s look at the output from the select
step:
{
'identifier':
{'id': 12336, 'type': 'vertex', 'properties': {'_key': [{'id': '746-9io-sl', 'value': '58'}], 'type': [{'id': '7ie-9io-1l1', 'value': 'Identifier'}], 'location': [{'id': '8au-9io-27t1', 'value': ''}], 'isCFGNode': [{'id': '9hi-9io-2a6d', 'value': ''}], 'code': [{'id': '7wm-9io-3yd', 'value': 'a'}], 'functionId': [{'id': '8p2-9io-28lh', 'value': '47'}], 'childNum': [{'id': '93a-9io-29dx', 'value': '1'}]}, 'label': 'vertex'},
'astnode':
[{'id': 49344, 'type': 'vertex', 'properties': {'_key': [{'id': 'zko-122o-sl', 'value': '49'}], 'type': [{'id': 'zyw-122o-1l1', 'value': 'CompoundStatement'}], 'location': [{'id': '10rc-122o-27t1', 'value': '12:0:111:131'}], 'isCFGNode': [{'id': '11y0-122o-2a6d', 'value': ''}], 'code': [{'id': '10d4-122o-3yd', 'value': ''}], 'functionId': [{'id': '115k-122o-28lh', 'value': '47'}], 'childNum': [{'id': '11js-122o-29dx', 'value': '0'}]}, 'label': 'vertex'},
{'id': 8240, 'type': 'vertex', 'properties': {'_key': [{'id': '3ye-6cw-sl', 'value': '50'}], 'type': [{'id': '4cm-6cw-1l1', 'value': 'ExpressionStatement'}], 'location': [{'id': '552-6cw-27t1', 'value': '13:1:114:129'}], 'isCFGNode': [{'id': '6bq-6cw-2a6d', 'value': 'True'}], 'code': [{'id': '4qu-6cw-3yd', 'value': 'x = 8 * ++ a - b'}], 'functionId': [{'id': '5ja-6cw-28lh', 'value': '47'}], 'childNum': [{'id': '5xi-6cw-29dx', 'value': '0'}]}, 'label': 'vertex'},
{'id': 53440, 'type': 'vertex', 'properties': {'_key': [{'id': '12qg-158g-sl', 'value': '51'}], 'type': [{'id': '134o-158g-1l1', 'value': 'AssignmentExpression'}], 'location': [{'id': '13x4-158g-27t1', 'value': ''}], 'isCFGNode': [{'id': '153s-158g-2a6d', 'value': ''}], 'code': [{'id': '13iw-158g-3yd', 'value': 'x = 8 * ++ a - b'}], 'functionId': [{'id': '14bc-158g-28lh', 'value': '47'}], 'childNum': [{'id': '14pk-158g-29dx', 'value': '0'}]}, 'label': 'vertex'},
{'id': 61632, 'type': 'vertex', 'properties': {'_key': [{'id': '1920-1bk0-sl', 'value': '53'}], 'type': [{'id': '19g8-1bk0-1l1', 'value': 'AdditiveExpression'}], 'location': [{'id': '1a8o-1bk0-27t1', 'value': ''}], 'isCFGNode': [{'id': '1bfc-1bk0-2a6d', 'value': ''}], 'code': [{'id': '19ug-1bk0-3yd', 'value': '8 * ++ a - b'}], 'functionId': [{'id': '1amw-1bk0-28lh', 'value': '47'}], 'childNum': [{'id': '1b14-1bk0-29dx', 'value': '1'}]}, 'label': 'vertex'}
]
}
Now that is strange, we did not expect astnode
to be a list. After some experimenting, it turned out to have something to do with emit
. We have used emit
twice in the traversal (one is hidden in astNodes()
).
Let’s investigate and take a simpler traversal. We take the root AST node, label it rootnode
and traverse all its AST children, which will get the label node
.
g.V()
.has('type','Function')
.has('code','aexp_test_1')
.functionToAST()
.as('rootnode')
.repeat(out(AST_EDGE)).emit() // get AST nodes
.as('node')
.select('rootnode','node')
The first result looks like:
{
'rootnode':
{'id': 32992, 'properties': {'isCFGNode': [{'value': '', 'id': 'pb0-pgg-2a6d'}], 'type': [{'value': 'FunctionDef', 'id': 'nbw-pgg-1l1'}], 'childNum': [{'value': '0', 'id': 'ows-pgg-29dx'}], 'functionId': [{'value': '47', 'id': 'oik-pgg-28lh'}], 'code': [{'value': 'aexp_test_1 ()', 'id': 'nq4-pgg-3yd'}], 'location': [{'value': '', 'id': 'o4c-pgg-27t1'}], '_key': [{'value': '48', 'id': 'mxo-pgg-sl'}]}, 'type': 'vertex', 'label': 'vertex'},
'node':
{'id': 16432, 'properties': {'isCFGNode': [{'value': '', 'id': 'cna-cog-2a6d'}], 'type': [{'value': 'ReturnType', 'id': 'ao6-cog-1l1'}], 'childNum': [{'value': '1', 'id': 'c92-cog-29dx'}], 'functionId': [{'value': '47', 'id': 'buu-cog-28lh'}], 'code': [{'value': 'void', 'id': 'b2e-cog-3yd'}], 'location': [{'value': '', 'id': 'bgm-cog-27t1'}], '_key': [{'value': '60', 'id': 'a9y-cog-sl'}]}, 'type': 'vertex', 'label': 'vertex'}
}
But in the last result, node
is a list:
{
'rootnode':
{'id': 32992, 'properties': {'isCFGNode': [{'value': '', 'id': 'pb0-pgg-2a6d'}], 'type': [{'value': 'FunctionDef', 'id': 'nbw-pgg-1l1'}], 'childNum': [{'value': '0', 'id': 'ows-pgg-29dx'}], 'functionId': [{'value': '47', 'id': 'oik-pgg-28lh'}], 'code': [{'value': 'aexp_test_1 ()', 'id': 'nq4-pgg-3yd'}], 'location': [{'value': '', 'id': 'o4c-pgg-27t1'}], '_key': [{'value': '48', 'id': 'mxo-pgg-sl'}]}, 'type': 'vertex', 'label': 'vertex'},
'node':
[
{'id': 49344, 'properties': {'isCFGNode': [{'value': '', 'id': '11y0-122o-2a6d'}], 'type': [{'value': 'CompoundStatement', 'id': 'zyw-122o-1l1'}], 'childNum': [{'value': '0', 'id': '11js-122o-29dx'}], 'functionId': [{'value': '47', 'id': '115k-122o-28lh'}], 'code': [{'value': '', 'id': '10d4-122o-3yd'}], 'location': [{'value': '12:0:111:131', 'id': '10rc-122o-27t1'}], '_key': [{'value': '49', 'id': 'zko-122o-sl'}]}, 'type': 'vertex', 'label': 'vertex'},
{'id': 8240, 'properties': {'isCFGNode': [{'value': 'True', 'id': '6bq-6cw-2a6d'}], 'type': [{'value': 'ExpressionStatement', 'id': '4cm-6cw-1l1'}], 'childNum': [{'value': '0', 'id': '5xi-6cw-29dx'}], 'functionId': [{'value': '47', 'id': '5ja-6cw-28lh'}], 'code': [{'value': 'x = 8 * ++ a - b', 'id': '4qu-6cw-3yd'}], 'location': [{'value': '13:1:114:129', 'id': '552-6cw-27t1'}], '_key': [{'value': '50', 'id': '3ye-6cw-sl'}]}, 'type': 'vertex', 'label': 'vertex'},
{'id': 53440, 'properties': {'isCFGNode': [{'value': '', 'id': '153s-158g-2a6d'}], 'type': [{'value': 'AssignmentExpression', 'id': '134o-158g-1l1'}], 'childNum': [{'value': '0', 'id': '14pk-158g-29dx'}], 'functionId': [{'value': '47', 'id': '14bc-158g-28lh'}], 'code': [{'value': 'x = 8 * ++ a - b', 'id': '13iw-158g-3yd'}], 'location': [{'value': '', 'id': '13x4-158g-27t1'}], '_key': [{'value': '51', 'id': '12qg-158g-sl'}]}, 'type': 'vertex', 'label': 'vertex'},
{'id': 61632, 'properties': {'isCFGNode': [{'value': '', 'id': '1bfc-1bk0-2a6d'}], 'type': [{'value': 'AdditiveExpression', 'id': '19g8-1bk0-1l1'}], 'childNum': [{'value': '1', 'id': '1b14-1bk0-29dx'}], 'functionId': [{'value': '47', 'id': '1amw-1bk0-28lh'}], 'code': [{'value': '8 * ++ a - b', 'id': '19ug-1bk0-3yd'}], 'location': [{'value': '', 'id': '1a8o-1bk0-27t1'}], '_key': [{'value': '53', 'id': '1920-1bk0-sl'}]}, 'type': 'vertex', 'label': 'vertex'},
{'id': 65728, 'properties': {'isCFGNode': [{'value': '', 'id': '1el4-1eps-2a6d'}], 'type': [{'value': 'MultiplicativeExpression', 'id': '1cm0-1eps-1l1'}], 'childNum': [{'value': '0', 'id': '1e6w-1eps-29dx'}], 'functionId': [{'value': '47', 'id': '1dso-1eps-28lh'}], 'code': [{'value': '8 * ++ a', 'id': '1d08-1eps-3yd'}], 'location': [{'value': '', 'id': '1deg-1eps-27t1'}], '_key': [{'value': '54', 'id': '1c7s-1eps-sl'}]}, 'type': 'vertex', 'label': 'vertex'},
{'id': 24816, 'properties': {'isCFGNode': [{'value': '', 'id': 'izi-j5c-2a6d'}], 'type': [{'value': 'UnaryExpression', 'id': 'h0e-j5c-1l1'}], 'childNum': [{'value': '1', 'id': 'ila-j5c-29dx'}], 'functionId': [{'value': '47', 'id': 'i72-j5c-28lh'}], 'code': [{'value': '++ a', 'id': 'hem-j5c-3yd'}], 'location': [{'value': '', 'id': 'hsu-j5c-27t1'}], '_key': [{'value': '56', 'id': 'gm6-j5c-sl'}]}, 'type': 'vertex', 'label': 'vertex'},
{'id': 20616, 'properties': {'isCFGNode': [{'value': '', 'id': 'ftd-fwo-2a6d'}], 'type': [{'value': 'IncDec', 'id': 'du9-fwo-1l1'}], 'childNum': [{'value': '0', 'id': 'ff5-fwo-29dx'}], 'functionId': [{'value': '47', 'id': 'f0x-fwo-28lh'}], 'code': [{'value': '++', 'id': 'e8h-fwo-3yd'}], 'location': [{'value': '', 'id': 'emp-fwo-27t1'}], '_key': [{'value': '57', 'id': 'dg1-fwo-sl'}]}, 'type': 'vertex', 'label': 'vertex'}
]
}
???
In short: if we use emit
normally, we will see only the element that was emitted, but as soon as we start to label it for use with select
, a list of elements is stored.
The Gremlin documentation does not tell what is emitted, but it tells us that emit
splits the traverser in two. This could explain the observed behaviour.
My hypothesis (or wild guess) is that emit
creates a subtraversal that normally is traversed, but when labeled with as
, not the elements in that traversal that are labeled, but the traversal itself. Or something like that.
A solution could be to unfold
the subtraversal. Let’s see how that works:
g.V()
.has('type','Function')
.has('code','aexp_test_1')
.functionToAST()
.as('rootnode')
.repeat(out(AST_EDGE)).emit().unfold() // get AST nodes
.as('node')
.select('rootnode','node')
Now the last node is no longer a list:
{
'rootnode':
{'type': 'vertex', 'label': 'vertex', 'id': 32992, 'properties': {'functionId': [{'id': 'oik-pgg-28lh', 'value': '47'}], 'code': [{'id': 'nq4-pgg-3yd', 'value': 'aexp_test_1 ()'}], 'isCFGNode': [{'id': 'pb0-pgg-2a6d', 'value': ''}], 'location': [{'id': 'o4c-pgg-27t1', 'value': ''}], 'type': [{'id': 'nbw-pgg-1l1', 'value': 'FunctionDef'}], '_key': [{'id': 'mxo-pgg-sl', 'value': '48'}], 'childNum': [{'id': 'ows-pgg-29dx', 'value': '0'}]}},
'node':
{'type': 'vertex', 'label': 'vertex', 'id': 20616, 'properties': {'functionId': [{'id': 'f0x-fwo-28lh', 'value': '47'}], 'code': [{'id': 'e8h-fwo-3yd', 'value': '++'}], 'isCFGNode': [{'id': 'ftd-fwo-2a6d', 'value': ''}], 'location': [{'id': 'emp-fwo-27t1', 'value': ''}], 'type': [{'id': 'du9-fwo-1l1', 'value': 'IncDec'}], '_key': [{'id': 'dg1-fwo-sl', 'value': '57'}], 'childNum': [{'id': 'ff5-fwo-29dx', 'value': '0'}]}}
}
Let’s try this in our original query:
g.V()
.has('type','Function')
.has('code','aexp_test_1')
.functionToAST()
.astNodes().unfold()
.as('astnode')
.repeat(out(AST_EDGE)).emit{ it.get().value('type') == 'Identifier' }.unfold()
.as('identifier')
.select('astnode','identifier')
.by('code')
The result:
{'identifier': 'x', 'astnode': ''}
{'identifier': 'b', 'astnode': ''}
{'identifier': 'a', 'astnode': ''}
{'identifier': 'x', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'b', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'a', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'x', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'b', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'a', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'b', 'astnode': '8 * ++ a - b'}
{'identifier': 'a', 'astnode': '8 * ++ a - b'}
{'identifier': 'a', 'astnode': '8 * ++ a'}
{'identifier': 'a', 'astnode': '++ a'}
It turns out that using only the first unfold
is sufficient, but an extra unfold
does not hurt.
It looks like it worked, but why do we not see the identifiers in the ast nodes? That is because the second traversal tries to traverse AST_EDGE
edges. If that does not work, the traversal will not yield any results. If you also want to emit the elements from which no traversal can be made, you need to emit
before the repeat
:
g.V()
.has('type','Function')
.has('code','aexp_test_1')
.functionToAST()
.astNodes().unfold()
.as('astnode')
.emit{ it.get().value('type') == 'Identifier' }
.repeat(out(AST_EDGE))
.unfold()
.as('identifier')
.select('astnode','identifier')
.by('code')
which results in:
{'astnode': '', 'identifier': 'x'}
{'astnode': '', 'identifier': 'b'}
{'astnode': '', 'identifier': 'a'}
{'astnode': 'aexp_test_1', 'identifier': 'aexp_test_1'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'x'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'b'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'a'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'x'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'b'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'a'}
{'astnode': 'x', 'identifier': 'x'}
{'astnode': '8 * ++ a - b', 'identifier': 'b'}
{'astnode': '8 * ++ a - b', 'identifier': 'a'}
{'astnode': '8 * ++ a', 'identifier': 'a'}
{'astnode': 'b', 'identifier': 'b'}
{'astnode': '++ a', 'identifier': 'a'}
{'astnode': 'a', 'identifier': 'a'}
Strictly speaking, this should also happen in astNodes
:
GraphTraversal.metaClass.astNodes = {
delegate.repeat(__.start().children()).until(noMoreChildren()).emit{true}
}
So, let’s inline the other implementation for now:
g.V()
.has('type','Function')
.has('code','aexp_test_1')
.functionToAST()
.emit().repeat(out(AST_EDGE)).unfold()
.as('astnode')
.emit{ it.get().value('type') == 'Identifier' }
.repeat(out(AST_EDGE))
.unfold()
.as('identifier')
.select('astnode','identifier')
.by('code')
The results still contains duplicates, but all AST nodes are included!
{'identifier': 'aexp_test_1', 'astnode': 'aexp_test_1 ()'}
{'identifier': 'x', 'astnode': 'aexp_test_1 ()'}
{'identifier': 'b', 'astnode': 'aexp_test_1 ()'}
{'identifier': 'a', 'astnode': 'aexp_test_1 ()'}
{'identifier': 'x', 'astnode': ''}
{'identifier': 'b', 'astnode': ''}
{'identifier': 'a', 'astnode': ''}
{'identifier': 'aexp_test_1', 'astnode': 'aexp_test_1'}
{'identifier': 'x', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'b', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'a', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'x', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'b', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'a', 'astnode': 'x = 8 * ++ a - b'}
{'identifier': 'x', 'astnode': 'x'}
{'identifier': 'b', 'astnode': '8 * ++ a - b'}
{'identifier': 'a', 'astnode': '8 * ++ a - b'}
{'identifier': 'a', 'astnode': '8 * ++ a'}
{'identifier': 'b', 'astnode': 'b'}
{'identifier': 'a', 'astnode': '++ a'}
{'identifier': 'a', 'astnode': 'a'}
To eliminate the duplicates, we insert a dedup
step at the end. The query becomes:
g.V()
.has('type','Function')
.has('code','aexp_test_1')
.functionToAST()
.emit().repeat(out(AST_EDGE)).unfold()
.as('astnode')
.emit{ it.get().value('type') == 'Identifier' }
.repeat(out(AST_EDGE))
.unfold()
.as('identifier')
.select('astnode','identifier')
.by('code')
.dedup()
{'astnode': 'aexp_test_1 ()', 'identifier': 'aexp_test_1'}
{'astnode': 'aexp_test_1 ()', 'identifier': 'x'}
{'astnode': 'aexp_test_1 ()', 'identifier': 'b'}
{'astnode': 'aexp_test_1 ()', 'identifier': 'a'}
{'astnode': '', 'identifier': 'x'}
{'astnode': '', 'identifier': 'b'}
{'astnode': '', 'identifier': 'a'}
{'astnode': 'aexp_test_1', 'identifier': 'aexp_test_1'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'x'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'b'}
{'astnode': 'x = 8 * ++ a - b', 'identifier': 'a'}
{'astnode': 'x', 'identifier': 'x'}
{'astnode': '8 * ++ a - b', 'identifier': 'b'}
{'astnode': '8 * ++ a - b', 'identifier': 'a'}
{'astnode': '8 * ++ a', 'identifier': 'a'}
{'astnode': 'b', 'identifier': 'b'}
{'astnode': '++ a', 'identifier': 'a'}
{'astnode': 'a', 'identifier': 'a'}