Skip to content

Commit 951b444

Browse files
docs: Fix PR comments in PPR
1 parent 3508d8b commit 951b444

File tree

1 file changed

+84
-27
lines changed

1 file changed

+84
-27
lines changed

sources/platform/actors/publishing/monetize/pay_per_result.mdx

Lines changed: 84 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -48,37 +48,69 @@ Set memory limits using `minMemoryMbytes` and `maxMemoryMbytes` in your [`actor.
4848
"actorSpecification": 1,
4949
"name": "name-of-my-scraper",
5050
"version": "0.0",
51-
"minMemoryMbytes": 256,
52-
"maxMemoryMbytes": 4096,
51+
"minMemoryMbytes": 512,
52+
"maxMemoryMbytes": 1024,
5353
}
5454
```
5555

56+
:::note Memory requirements for browser-based scraping
57+
58+
When using browser automation tools like Puppeteer or Playwright for web scraping, increase the memory limits to accommodate the browser's memory usage.
59+
60+
:::
61+
5662
### Implement the `ACTOR_MAX_PAID_DATASET_ITEMS` check
5763

5864
This check prevents your Actor from generating more results than the user has paid for, protecting both you and your users from unexpected costs.
5965

60-
The `ACTOR_MAX_PAID_DATASET_ITEMS` environment variable contains the user-set limit on returned results for paid-per-result Actors. Do not exceed this limit.
66+
The `ACTOR_MAX_PAID_DATASET_ITEMS` environment variable contains the user-set limit on returned results for paid-per-result Actors. Do not exceed this limit. You can see the example implementation in the following code snippets.
6167

6268
<Tabs groupId="main">
6369
<TabItem value="JavaScript" label="JavaScript">
6470

6571
```js
66-
const MAX_ITEMS = Number(process.env.ACTOR_MAX_PAID_DATASET_ITEMS);
72+
import { Actor } from 'apify';
73+
74+
// Use top-level variables with a closure so you don't have to initialize anything
75+
const MAX_ITEMS: number | undefined = Number(process.env.ACTOR_MAX_PAID_DATASET_ITEMS) || undefined;
76+
77+
let isInitialized = false;
78+
let isGettingItemCount = false;
6779
let pushedItemCount = 0;
6880

69-
export const pushDataMaxAware = async (data) => {
70-
// rest of the Actor logic
81+
export const pushDataMaxAware = async (data: Parameters<Actor['pushData']>[0]): Promise<{ shouldStop: boolean }> => {
82+
// If this isn't pay-per-result, just push like normallyå
83+
if (!MAX_ITEMS) {
84+
await Actor.pushData(data);
85+
return { shouldStop: false };
86+
}
87+
88+
// Initialize on the first call so it as standalone function
89+
if (!isInitialized && !isGettingItemCount) {
90+
isGettingItemCount = true;
91+
const dataset = await Actor.openDataset();
92+
const { itemCount } = (await dataset.getInfo())!;
93+
pushedItemCount = itemCount;
94+
isGettingItemCount = false;
95+
isInitialized = true;
96+
}
97+
98+
// Others handlers will wait until initialized which should be few milliseconds
99+
while (!isInitialized) {
100+
await new Promise((resolve) => setTimeout(resolve, 50));
101+
}
71102

72103
const dataAsArray = Array.isArray(data) ? data : [data];
73104
const dataToPush = dataAsArray.slice(0, MAX_ITEMS - pushedItemCount);
74105

75106
if (dataToPush.length) {
107+
// Update the state before 'await' to avoid race conditions
76108
pushedItemCount += dataToPush.length;
77109
await Actor.pushData(dataToPush);
78110
}
79111

80-
// rest of the Actor logic
81-
}
112+
return { shouldStop: pushedItemCount >= MAX_ITEMS };
113+
};
82114
```
83115

84116
</TabItem>
@@ -87,30 +119,55 @@ export const pushDataMaxAware = async (data) => {
87119
```python
88120
import os
89121
from apify import Actor
90-
91-
MAX_ITEMS = int(os.getenv('ACTOR_MAX_PAID_DATASET_ITEMS', 0))
92-
93-
# rest of the Actor logic
94-
95-
async def push_data_max_aware(data, pushed_item_count=0):
96-
data_as_array = data if isinstance(data, list) else [data]
97-
data_to_push = data_as_array[:MAX_ITEMS - pushed_item_count]
98-
99-
if data_to_push:
100-
new_count = pushed_item_count + len(data_to_push)
101-
await Actor.push_data(data_to_push)
102-
return new_count
103-
104-
return pushed_item_count
105-
106-
# rest of the Actor logic
122+
from typing import Union, List, Dict, Any
123+
124+
class PayPerResultManager:
125+
def __init__(self):
126+
self.max_items = int(os.getenv('ACTOR_MAX_PAID_DATASET_ITEMS', 0)) or None
127+
self.is_initialized = False
128+
self.is_getting_item_count = False
129+
self.pushed_item_count = 0
130+
131+
async def push_data_max_aware(self, data: Union[Dict[Any, Any], List[Dict[Any, Any]]]) -> Dict[str, bool]:
132+
# If this isn't pay-per-result, just push like normally
133+
if not self.max_items:
134+
await Actor.push_data(data)
135+
return {'shouldStop': False}
136+
137+
# Initialize on the first call
138+
if not self.is_initialized and not self.is_getting_item_count:
139+
self.is_getting_item_count = True
140+
dataset = await Actor.open_dataset()
141+
dataset_info = await dataset.get_info()
142+
self.pushed_item_count = dataset_info['itemCount']
143+
self.is_getting_item_count = False
144+
self.is_initialized = True
145+
146+
# Others handlers will wait until initialized which should be few milliseconds
147+
while not self.is_initialized:
148+
await Actor.sleep(0.05)
149+
150+
data_as_array = data if isinstance(data, list) else [data]
151+
data_to_push = data_as_array[:self.max_items - self.pushed_item_count]
152+
153+
if data_to_push:
154+
# Update the state before 'await' to avoid race conditions
155+
self.pushed_item_count += len(data_to_push)
156+
await Actor.push_data(data_to_push)
157+
158+
return {'shouldStop': self.pushed_item_count >= self.max_items}
159+
160+
# Create a singleton instance
161+
ppr_manager = PayPerResultManager()
162+
163+
# Convenience function that uses the singleton
164+
async def push_data_max_aware(data: Union[Dict[Any, Any], List[Dict[Any, Any]]]) -> Dict[str, bool]:
165+
return await ppr_manager.push_data_max_aware(data)
107166
```
108167

109168
</TabItem>
110169
</Tabs>
111170

112-
You can find the whole code of implementing this check in this [example](https://github.com/metalwarrior665/max-paid-items-example/blob/master/src/push-data.ts).
113-
114171
### Test your Actor
115172

116173
Test your Actor with various result volumes to determine optimal pricing. Start with minimal datasets (1-100 results) to understand your base costs and ensure the Actor works correctly with small inputs. Then test with typical usage volumes (1,000-10,000 results) to simulate real-world scenarios and identify any performance bottlenecks.

0 commit comments

Comments
 (0)