Skip to content

Commit dbd4c89

Browse files
authored
fix(shared): add text content for leaf elements before text node (#837)
* fix(shared): add text content for leaf elements before text node
* fix(core): return too early
* fix(core): update page description condition for non-vl mode
* test(core): add bug test case
1 parent 5f79ef8 commit dbd4c89

File tree

5 files changed

+1215
-21
lines changed

5 files changed

+1215
-21
lines changed

biome.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
"**/unpacked-extension/*",
2828
"**/page-data/**",
2929
"**/dump.json",
30-
"**/dump-with-invisible.json"
30+
"**/dump-with-invisible.json",
31+
"**/dump-for-utils-test.json"
3132
]
3233
},
3334
"javascript": {

packages/core/src/ai-model/prompt/util.ts

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { imageInfoOfBase64 } from '@/image/index';
22
import type { BaseElement, ElementTreeNode, Size, UIContext } from '@/types';
33
import { NodeType } from '@midscene/shared/constants';
4+
import { vlLocateMode } from '@midscene/shared/env';
45
import {
56
descriptionOfTree,
67
generateElementByPosition,
@@ -61,14 +62,14 @@ export function elementByPositionWithElementInfo(
6162
position.y <= item.rect.top + item.rect.height
6263
) {
6364
if (
64-
filterPositionElements &&
65-
item.attributes?.nodeType === NodeType.POSITION
65+
!(
66+
filterPositionElements &&
67+
item.attributes?.nodeType === NodeType.POSITION
68+
) &&
69+
item.isVisible
6670
) {
67-
// Skip POSITION elements if filterPositionElements is true
68-
return;
71+
matchingElements.push(item);
6972
}
70-
71-
matchingElements.push(item);
7273
}
7374
}
7475

@@ -167,7 +168,8 @@ export async function describeUserPage<
167168

168169
let pageDescription = '';
169170
const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === 'visible-only';
170-
if (opt?.domIncluded) {
171+
if (opt?.domIncluded || !vlLocateMode()) {
172+
// non-vl mode must provide the page description
171173
const contentTree = await descriptionOfTree(
172174
treeRoot,
173175
opt?.truncateTextLength,

packages/core/tests/fixtures/dump-for-utils-test.json

Lines changed: 982 additions & 0 deletions
Large diffs are not rendered by default.

packages/core/tests/unit-test/prompt/utils.test.ts

Lines changed: 198 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,32 @@
1-
import { describeUserPage } from '@/ai-model/prompt/util';
21
import { treeToList } from '@midscene/shared/extractor';
32
import { getContextFromFixture } from 'tests/evaluation';
4-
import { describe, expect, it } from 'vitest';
3+
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
54

6-
describe('prompt utils', () => {
5+
// Mock the env module before importing the function that uses it
6+
vi.mock('@midscene/shared/env', () => ({
7+
vlLocateMode: vi.fn(() => 'qwen-vl' as const), // default to 'qwen-vl'
8+
}));
9+
10+
import fs from 'node:fs';
11+
import path from 'node:path';
12+
import {
13+
describeUserPage,
14+
elementByPositionWithElementInfo,
15+
} from '@/ai-model/prompt/util';
16+
import { vlLocateMode } from '@midscene/shared/env';
17+
18+
describe('prompt utils - describeUserPage', () => {
719
let lengthOfDescription: number;
20+
21+
beforeEach(() => {
22+
vi.clearAllMocks();
23+
});
24+
25+
afterEach(() => {
26+
// Reset to default value
27+
vi.mocked(vlLocateMode).mockReturnValue('qwen-vl');
28+
});
29+
830
it('describe context ', async () => {
931
const context = await getContextFromFixture('taobao');
1032
const { description } = await describeUserPage(context.context, {
@@ -49,4 +71,177 @@ describe('prompt utils', () => {
4971
treeToList(context.context.tree).length,
5072
);
5173
});
74+
75+
it('describe context with non-vl mode', async () => {
76+
// Mock vlLocateMode to return false for this test
77+
vi.mocked(vlLocateMode).mockReturnValue(false);
78+
79+
const context = await getContextFromFixture('taobao');
80+
const { description } = await describeUserPage(context.context, {
81+
domIncluded: false,
82+
});
83+
84+
// In non-vl mode, description should include page elements even when domIncluded is false
85+
expect(description).toBeTruthy();
86+
});
87+
88+
it('describe context with vl mode', async () => {
89+
// Mock vlLocateMode to return a VL mode for this test
90+
vi.mocked(vlLocateMode).mockReturnValue('qwen-vl');
91+
92+
const context = await getContextFromFixture('taobao');
93+
const { description } = await describeUserPage(context.context, {
94+
domIncluded: false,
95+
});
96+
97+
// In vl mode, description should be empty if domIncluded is false
98+
expect(description).toBeFalsy();
99+
});
100+
});
101+
102+
describe('prompt utils - elementByPositionWithElementInfo', () => {
103+
it('should return the correct element at the position(filter invisible elements)', async () => {
104+
const dumpPath = path.join(
105+
__dirname,
106+
'../../',
107+
'fixtures',
108+
'dump-for-utils-test.json',
109+
);
110+
const dump = JSON.parse(fs.readFileSync(dumpPath, 'utf8'));
111+
const targetNode = {
112+
node: {
113+
content: '选好了',
114+
rect: {
115+
left: 138,
116+
top: 849,
117+
width: 247,
118+
height: 38,
119+
zoom: 1,
120+
isVisible: true,
121+
},
122+
center: [261, 868],
123+
id: 'hdocg',
124+
indexId: 263,
125+
attributes: {
126+
type: 'button',
127+
class: '.submit-btn.ant-btn.ant-btn-primary.ant-btn-lg.ant-btn-block',
128+
htmlTagName: '<button>',
129+
nodeType: 'BUTTON Node',
130+
},
131+
isVisible: true,
132+
},
133+
children: [],
134+
};
135+
const rectCenter = {
136+
x: targetNode.node.rect.left + targetNode.node.rect.width / 2,
137+
y: targetNode.node.rect.top + targetNode.node.rect.height / 2,
138+
};
139+
const element = elementByPositionWithElementInfo(
140+
dump.executions[0].tasks[0].pageContext.tree,
141+
rectCenter,
142+
{
143+
requireStrictDistance: false,
144+
filterPositionElements: true,
145+
},
146+
);
147+
148+
expect(element?.id).toBe(targetNode.node.id);
149+
});
150+
151+
it('should return the correct element at the position with filterPositionElements = false', async () => {
152+
const dumpPath = path.join(
153+
__dirname,
154+
'../../',
155+
'fixtures',
156+
'dump-for-utils-test.json',
157+
);
158+
const dump = JSON.parse(fs.readFileSync(dumpPath, 'utf8'));
159+
const targetNode = {
160+
node: {
161+
content: '选好了',
162+
rect: {
163+
left: 138,
164+
top: 849,
165+
width: 247,
166+
height: 38,
167+
zoom: 1,
168+
isVisible: true,
169+
},
170+
center: [261, 868],
171+
id: 'hdocg',
172+
indexId: 263,
173+
attributes: {
174+
type: 'button',
175+
class: '.submit-btn.ant-btn.ant-btn-primary.ant-btn-lg.ant-btn-block',
176+
htmlTagName: '<button>',
177+
nodeType: 'BUTTON Node',
178+
},
179+
isVisible: true,
180+
},
181+
children: [],
182+
};
183+
const rectCenter = {
184+
x: targetNode.node.rect.left + targetNode.node.rect.width / 2,
185+
y: targetNode.node.rect.top + targetNode.node.rect.height / 2,
186+
};
187+
const element = elementByPositionWithElementInfo(
188+
dump.executions[0].tasks[0].pageContext.tree,
189+
rectCenter,
190+
{
191+
requireStrictDistance: false,
192+
filterPositionElements: false,
193+
},
194+
);
195+
196+
expect(element?.id).not.toBe(targetNode.node.id);
197+
expect(element?.attributes?.nodeType).toBe('POSITION Node');
198+
});
199+
200+
it('should return correct element at the position when strictDistance is true', async () => {
201+
const dumpPath = path.join(
202+
__dirname,
203+
'../../',
204+
'fixtures',
205+
'dump-for-utils-test.json',
206+
);
207+
const dump = JSON.parse(fs.readFileSync(dumpPath, 'utf8'));
208+
const targetNode = {
209+
node: {
210+
content: '选好了',
211+
rect: {
212+
left: 138,
213+
top: 849,
214+
width: 247,
215+
height: 38,
216+
zoom: 1,
217+
isVisible: true,
218+
},
219+
center: [261, 868],
220+
id: 'hdocg',
221+
indexId: 263,
222+
attributes: {
223+
type: 'button',
224+
class: '.submit-btn.ant-btn.ant-btn-primary.ant-btn-lg.ant-btn-block',
225+
htmlTagName: '<button>',
226+
nodeType: 'BUTTON Node',
227+
},
228+
isVisible: true,
229+
},
230+
children: [],
231+
};
232+
const rectCenter = {
233+
x: targetNode.node.rect.left + targetNode.node.rect.width / 2,
234+
y: targetNode.node.rect.top + targetNode.node.rect.height / 2,
235+
};
236+
const element = elementByPositionWithElementInfo(
237+
dump.executions[0].tasks[0].pageContext.tree,
238+
rectCenter,
239+
{
240+
requireStrictDistance: true,
241+
filterPositionElements: true,
242+
},
243+
);
244+
245+
expect(element?.id).toBe(targetNode.node.id);
246+
});
52247
});

packages/shared/src/extractor/locator.ts

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import type { ElementInfo } from '.';
2+
import { isButtonElement, isFormElement } from './dom-util';
23
import { getNodeFromCacheList } from './util';
34
import { getRect, isElementPartiallyInViewport } from './util';
45
import { collectElementInfo } from './web-extractor';
@@ -32,6 +33,21 @@ const getTextNodeIndex = (textNode: Node): number => {
3233
return index;
3334
};
3435

36+
// Helper function to create normalize-space condition
37+
const createNormalizeSpaceCondition = (textContent: string): string => {
38+
return `[normalize-space()="${textContent}"]`;
39+
};
40+
41+
// Helper function to add text content to xpath if applicable
42+
const addTextContentToXPath = (el: Node, baseXPath: string): string => {
43+
const textContent = el.textContent?.trim();
44+
if (textContent && (isButtonElement(el) || isFormElement(el))) {
45+
// add text content for leaf elements before text node
46+
return `${baseXPath}${createNormalizeSpaceCondition(textContent)}`;
47+
}
48+
return baseXPath;
49+
};
50+
3551
const getElementXPath = (element: Node): string => {
3652
// deal with text node
3753
if (element.nodeType === Node.TEXT_NODE) {
@@ -44,7 +60,7 @@ const getElementXPath = (element: Node): string => {
4460

4561
// If we have text content, include it in the xpath for better matching
4662
if (textContent) {
47-
return `${parentXPath}/text()[${textIndex}][normalize-space()="${textContent}"]`;
63+
return `${parentXPath}/text()[${textIndex}]${createNormalizeSpaceCondition(textContent)}`;
4864
}
4965
return `${parentXPath}/text()[${textIndex}]`;
5066
}
@@ -64,20 +80,18 @@ const getElementXPath = (element: Node): string => {
6480
return '/html/body';
6581
}
6682

67-
// If no parent node, return just the tag name
68-
if (!el.parentNode) {
69-
return `/${el.nodeName.toLowerCase()}`;
70-
}
71-
7283
const index = getElementIndex(el);
7384
const tagName = el.nodeName.toLowerCase();
7485

75-
if (el.parentNode) {
76-
const parentXPath = getElementXPath(el.parentNode);
77-
return `${parentXPath}/${tagName}[${index}]`;
86+
// If no parent node, return just the tag name
87+
if (!el.parentNode) {
88+
const baseXPath = `/${tagName}`;
89+
return addTextContentToXPath(el, baseXPath);
7890
}
7991

80-
return `/${tagName}[${index}]`;
92+
const parentXPath = getElementXPath(el.parentNode);
93+
const baseXPath = `${parentXPath}/${tagName}[${index}]`;
94+
return addTextContentToXPath(el, baseXPath);
8195
};
8296

8397
function generateXPaths(node: Node | null): string[] {

0 commit comments

Comments
 (0)