Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(SoM): setOfMarks Overlays unify into setOfMarksOverlays function #82

Merged
merged 4 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions src/main/shared/setOfMarks.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import { test, expect } from 'vitest';
import { setOfMarksOverlays } from './setOfMarks';
// import { sleep } from '@ui-tars/shared/utils';

/**
 * Smoke-test driver: feeds a handful of representative parsed predictions
 * (click, type, drag, hotkey, wait) through `setOfMarksOverlays`, one
 * prediction per call, and logs the overlays produced for each.
 *
 * Nothing is asserted here; the surrounding test only checks that no call
 * throws.
 */
const testMakeScreenMarker = () => {
  // Last known marker position. It is only reassigned by the (disabled)
  // manual preview below, so every call currently receives `undefined`.
  let lastX: number | undefined;
  let lastY: number | undefined;

  // Box literals shared by several sample predictions.
  const iconBox = '[0.1171875,0.20833333,0.1171875,0.20833333]';
  const sliderTargetBox = '[0.175,0.647,0.175,0.647]';

  // Fixed screenshot size used for every call.
  const screenSize = { width: 2560, height: 1440 };

  const samplePredictions = [
    {
      action_type: 'double_click',
      action_inputs: { start_box: iconBox },
      reflection: 'reflection',
      thought: 'thought',
    },
    {
      action_type: 'type',
      action_inputs: { content: 'Hello, world!' },
      reflection: 'reflection',
      thought: 'thought',
    },
    {
      action_type: 'drag',
      action_inputs: { start_box: iconBox, end_box: sliderTargetBox },
      reflection: 'reflection',
      thought: 'thought',
    },
    {
      reflection: '',
      thought:
        '我已经在搜索框中输入了"杭州天气",但还需要按下回车键来执行搜索。现在需要按下回车键来提交搜索请求,这样就能看到杭州的天气信息。',
      action_type: 'hotkey',
      action_inputs: { key: 'ctrl enter' },
    },
    {
      reflection: '',
      thought:
        'To narrow down the search results to cat litters within the specified price range of $18 to $32, I need to adjust the price filter. The next logical step is to drag the left handle of the price slider to set the minimum price to $18, ensuring that only products within the desired range are displayed.\n' +
        'Drag the left handle of the price slider to set the minimum price to $18.',
      action_type: 'drag',
      action_inputs: {
        start_box: '[0.072,0.646,0.072,0.646]',
        end_box: sliderTargetBox,
      },
    },
    {
      reflection: null,
      thought:
        '我看到桌面上有Google Chrome的图标,要完成打开Chrome的任务,我需要双击该图标。在之前的操作中,我已经双击了Chrome图标,但是页面没有发生变化,我应该等待一段时间,等待页面加载完成。',
      action_type: 'wait',
      action_inputs: {},
    },
  ];

  for (const prediction of samplePredictions) {
    const result = setOfMarksOverlays({
      predictions: [prediction],
      screenshotContext: screenSize,
      xPos: lastX,
      yPos: lastY,
    });
    console.log('overlays', result.overlays);

    // Disabled manual preview: shows each overlay in a transparent Electron
    // window for one second. Kept for local debugging; enabling it requires
    // Electron's BrowserWindow, the `sleep` helper and an async function.
    //
    // for (const overlay of result.overlays) {
    //   const currentOverlay = new BrowserWindow({
    //     width: overlay.boxWidth || 200,
    //     height: overlay.boxHeight || 200,
    //     transparent: true,
    //     frame: false,
    //     alwaysOnTop: true,
    //     skipTaskbar: true,
    //     focusable: false,
    //     hasShadow: false,
    //     thickFrame: false,
    //     paintWhenInitiallyHidden: true,
    //     type: 'panel',
    //     webPreferences: {
    //       nodeIntegration: true,
    //       contextIsolation: false,
    //     },
    //   });
    //   currentOverlay.webContents.openDevTools();
    //   if (overlay.xPos && overlay.yPos && overlay.svg) {
    //     currentOverlay.setPosition(
    //       overlay.xPos + overlay.offsetX,
    //       overlay.yPos + overlay.offsetY,
    //     );
    //     lastX = overlay.xPos;
    //     lastY = overlay.yPos;
    //     currentOverlay.loadURL(`data:text/html;charset=UTF-8,
    //       <html>
    //         <head>
    //           <style>
    //             html, body {
    //               background: transparent;
    //               margin: 0;
    //               padding: 0;
    //               overflow: hidden;
    //               width: 100%;
    //               height: 100%;
    //             }
    //           </style>
    //         </head>
    //         <body>
    //           ${overlay.svg}
    //         </body>
    //       </html>
    //     `);
    //   }
    //   await sleep(1000);
    //   currentOverlay.close();
    // }
  }
};

// Smoke test: generating overlays for every sample prediction must not throw.
test('not throw error', () => {
  expect(testMakeScreenMarker).not.toThrow();
});
206 changes: 206 additions & 0 deletions src/main/shared/setOfMarks.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
/*
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/

import { Conversation, PredictionParsed } from '@ui-tars/shared/types';
import { parseBoxToScreenCoords } from '@main/utils/coords';

/**
 * One visual highlight generated for a parsed prediction: an SVG snippet
 * plus the placement data needed to show it.
 */
export interface Overlay {
  /** The parsed prediction this overlay was generated for. */
  prediction: PredictionParsed;

  /** SVG markup to render for this overlay. */
  svg: string;

  /**
   * Horizontal anchor of the overlay. `undefined` when no position is known
   * (none was supplied and no click-like action has been processed yet).
   * NOTE(review): presumably screen pixels — confirm against the consumer.
   */
  xPos: number | undefined;
  /** Vertical anchor of the overlay; `undefined` under the same conditions as `xPos`. */
  yPos: number | undefined;

  /** Horizontal offset intended to be added to `xPos` when placing the overlay. */
  offsetX: number;
  /** Vertical offset intended to be added to `yPos` when placing the overlay. */
  offsetY: number;

  /** Width of the SVG canvas, matching the `width` attribute in `svg`. */
  boxWidth: number | undefined;
  /** Height of the SVG canvas, matching the `height` attribute in `svg`. */
  boxHeight: number | undefined;
}

/** Font stack shared by every overlay label. */
const OVERLAY_FONT_FAMILY =
  '-apple-system, BlinkMacSystemFont, Arial, sans-serif';

/**
 * Escapes a value for use as SVG/XML *element text*.
 *
 * Only `&`, `<` and `>` are replaced, which is sufficient for text content;
 * do not use the result inside attribute values.
 */
function escapeSvgText(value: unknown): string {
  return String(value ?? '')
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/** Builds an SVG of the given size containing a single centered red text label. */
function renderLabelSvg(
  boxWidth: number,
  boxHeight: number,
  label: string,
): string {
  return `<svg xmlns="http://www.w3.org/2000/svg" width="${boxWidth}" height="${boxHeight}" viewBox="0 0 ${boxWidth} ${boxHeight}">
      <text
        x="${boxWidth / 2}"
        y="${boxHeight / 2}"
        font-family="${OVERLAY_FONT_FAMILY}"
        font-size="16"
        fill="red"
        text-anchor="middle"
        dominant-baseline="middle"
      >${escapeSvgText(label)}</text>
    </svg>`;
}

/** Builds the click marker SVG: a rotating dashed ring, a center dot and the action name. */
function renderClickSvg(
  boxWidth: number,
  boxHeight: number,
  label: string,
): string {
  const cx = boxWidth / 2;
  const cy = boxHeight / 2;
  return `<svg xmlns="http://www.w3.org/2000/svg" width="${boxWidth}" height="${boxHeight}" viewBox="0 0 ${boxWidth} ${boxHeight}">
      <circle
        cx="${cx}"
        cy="${cy}"
        r="16"
        fill="none"
        stroke="red"
        stroke-width="3"
        stroke-dasharray="80 20"
        stroke-linecap="round">
        <animateTransform
          attributeName="transform"
          type="rotate"
          from="0 ${cx} ${cy}"
          to="360 ${cx} ${cy}"
          dur="1s"
          repeatCount="indefinite"
        />
      </circle>
      <circle
        cx="${cx}"
        cy="${cy}"
        r="3"
        fill="red"
      />
      <text
        x="${cx + 65}"
        y="${cy}"
        font-family="${OVERLAY_FONT_FAMILY}"
        font-size="16"
        fill="red"
        text-anchor="middle"
        dominant-baseline="middle"
      >${escapeSvgText(label)}</text>
    </svg>`;
}

/**
 * Builds the "set of marks" overlays (action highlights) for a list of
 * parsed predictions.
 *
 * - Click-like actions (`click`, `left_click`, `left_single`, `left_double`,
 *   `double_click`) that carry a `start_box` yield a circular marker centered
 *   on the parsed point and move the anchor to that point. Without a
 *   `start_box` they yield no overlay.
 * - `type` yields a `Typing: "<content>"` label.
 * - `hotkey` yields a `Hotkey: a + b` label built from the whitespace-separated keys.
 * - Every other action yields a label with the action type.
 *
 * Label overlays are placed at the current anchor: the most recent click
 * point, or the `xPos`/`yPos` passed by the caller when no click has been
 * processed yet (possibly `undefined`).
 *
 * All model-provided text is escaped before being embedded in the SVG.
 *
 * @param predictions parsed predictions to visualize, processed in order
 * @param screenshotContext screenshot size (`width`, `height`) forwarded to
 *   `parseBoxToScreenCoords` when resolving `start_box`
 * @param xPos optional initial horizontal anchor (e.g. from a previous call)
 * @param yPos optional initial vertical anchor (e.g. from a previous call)
 * @returns an object whose `overlays` array holds one entry per rendered prediction
 */
export const setOfMarksOverlays = ({
  predictions,
  screenshotContext,
  xPos,
  yPos,
}: {
  predictions: PredictionParsed[];
  screenshotContext: NonNullable<Conversation['screenshotContext']>['size'];
  xPos?: number;
  yPos?: number;
}): {
  overlays: Overlay[];
} => {
  const overlays: Overlay[] = [];
  const { width, height } = screenshotContext;

  // Track the anchor in locals instead of reassigning the parameters.
  let anchorX = xPos;
  let anchorY = yPos;

  for (const prediction of predictions) {
    switch (prediction.action_type) {
      case 'click':
      case 'left_click':
      case 'left_single':
      case 'left_double':
      case 'double_click': {
        const startBox = prediction.action_inputs?.start_box;
        if (!startBox) {
          // Nothing to point at: skip the overlay and keep the previous anchor.
          break;
        }
        const coords = parseBoxToScreenCoords(startBox, width, height);
        const boxWidth = 250;
        const boxHeight = 100;

        anchorX = Math.floor(coords.x);
        anchorY = Math.floor(coords.y);
        overlays.push({
          prediction,
          xPos: anchorX,
          // Negative half-size offsets center the box on the click point.
          offsetX: -boxWidth / 2,
          yPos: anchorY,
          offsetY: -boxHeight / 2,
          boxWidth,
          boxHeight,
          svg: renderClickSvg(boxWidth, boxHeight, prediction.action_type),
        });
        break;
      }
      case 'type': {
        const boxWidth = 400;
        const boxHeight = 100;
        // Default to an empty string so a missing content never renders as "undefined".
        const { content = '' } = prediction.action_inputs || {};

        overlays.push({
          prediction,
          xPos: anchorX,
          offsetX: 0,
          yPos: anchorY,
          offsetY: -boxHeight / 2,
          boxWidth,
          boxHeight,
          svg: renderLabelSvg(boxWidth, boxHeight, `Typing: "${content}"`),
        });
        break;
      }
      case 'hotkey': {
        const boxWidth = 200;
        const boxHeight = 100;
        const { key = '' } = prediction.action_inputs || {};
        // Split on any run of whitespace so stray spaces do not create empty segments.
        const keys = key.trim().split(/\s+/).filter(Boolean).join(' + ');

        overlays.push({
          prediction,
          xPos: anchorX,
          offsetX: 0,
          yPos: anchorY,
          offsetY: -boxHeight / 2,
          boxWidth,
          boxHeight,
          svg: renderLabelSvg(boxWidth, boxHeight, `Hotkey: ${keys}`),
        });
        break;
      }
      // Not specialised yet; these currently fall through to the default label:
      // 'scroll', 'left_click_drag', 'drag', 'select', 'hover', 'mouse_move', 'wait'.
      default: {
        const boxWidth = 100;
        const boxHeight = 100;
        overlays.push({
          prediction,
          xPos: anchorX,
          offsetX: 0,
          yPos: anchorY,
          offsetY: -boxHeight / 2,
          boxWidth,
          boxHeight,
          svg: renderLabelSvg(boxWidth, boxHeight, prediction.action_type),
        });
        break;
      }
    }
  }

  return {
    overlays,
  };
};
17 changes: 8 additions & 9 deletions src/main/store/runAgent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
showPauseButton();
showScreenWaterFlow();

agent.on('data', (data) => {
agent.on('data', async (data) => {

Check warning on line 50 in src/main/store/runAgent.ts

View check run for this annotation

Codecov / codecov/patch

src/main/store/runAgent.ts#L50

Added line #L50 was not covered by tests
const { status, conversations, ...restUserData } = data;

const {
Expand All @@ -65,21 +65,20 @@
'\n========',
);

// 使用封装后的方法显示标记
setState({

Check warning on line 68 in src/main/store/runAgent.ts

View check run for this annotation

Codecov / codecov/patch

src/main/store/runAgent.ts#L68

Added line #L68 was not covered by tests
...getState(),
status,
restUserData,
messages: [...(getState().messages || []), ...conversations],
});

if (
predictionParsed?.length &&
screenshotContext?.size &&
!abortController?.signal?.aborted
) {
showPredictionMarker(predictionParsed, screenshotContext.size);
}

setState({
...getState(),
status,
restUserData,
messages: [...(getState().messages || []), ...conversations],
});
});

agent.on('error', (e) => {
Expand Down
Loading
Loading