Skip to content

Commit 52c05c5

Browse files
authored
Introduce spam_checker_spammy internal event metadata. (#19453)
Follows: #19365 Part of: MSC4354 Sticky Events (experimental feature #19409) This PR introduces a `spam_checker_spammy` flag, analogous to `policy_server_spammy`, as an explicit flag that an event was decided to be spammy by a spam-checker module. The original Sticky Events PR (#18968) just reused `policy_server_spammy`, but it didn't sit right with me because we (at least appear to be experimenting with features that) allow users to opt-in to seeing `policy_server_spammy` events (presumably for moderation purposes). Keeping these flags separate felt best, therefore. As for why we need this flag: soon soft-failed status won't be permanent, at least for sticky events. The spam checker modules currently work by making events soft-failed. We want to prevent spammy events from getting reconsidered/un-soft-failed, so it seems like we need a flag to track spam-checker spamminess *separately* from soft-failed. Should be commit-by-commit friendly, but is also small. --------- Signed-off-by: Olivier 'reivilibre <oliverw@matrix.org>
1 parent bed00bb commit 52c05c5

11 files changed

Lines changed: 416 additions & 16 deletions

File tree

changelog.d/19453.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Introduce `spam_checker_spammy` internal event metadata.

rust/src/events/internal_metadata.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ enum EventInternalMetadataData {
5555
SoftFailed(bool),
5656
ProactivelySend(bool),
5757
PolicyServerSpammy(bool),
58+
SpamCheckerSpammy(bool),
5859
Redacted(bool),
5960
TxnId(Box<str>),
6061
DelayId(Box<str>),
@@ -105,6 +106,13 @@ impl EventInternalMetadataData {
105106
.to_owned()
106107
.into_any(),
107108
),
109+
EventInternalMetadataData::SpamCheckerSpammy(o) => (
110+
pyo3::intern!(py, "spam_checker_spammy"),
111+
o.into_pyobject(py)
112+
.unwrap_infallible()
113+
.to_owned()
114+
.into_any(),
115+
),
108116
EventInternalMetadataData::Redacted(o) => (
109117
pyo3::intern!(py, "redacted"),
110118
o.into_pyobject(py)
@@ -173,6 +181,11 @@ impl EventInternalMetadataData {
173181
.extract()
174182
.with_context(|| format!("'{key_str}' has invalid type"))?,
175183
),
184+
"spam_checker_spammy" => EventInternalMetadataData::SpamCheckerSpammy(
185+
value
186+
.extract()
187+
.with_context(|| format!("'{key_str}' has invalid type"))?,
188+
),
176189
"redacted" => EventInternalMetadataData::Redacted(
177190
value
178191
.extract()
@@ -468,6 +481,17 @@ impl EventInternalMetadata {
468481
set_property!(self, PolicyServerSpammy, obj);
469482
}
470483

484+
#[getter]
485+
fn get_spam_checker_spammy(&self) -> PyResult<bool> {
486+
Ok(get_property_opt!(self, SpamCheckerSpammy)
487+
.copied()
488+
.unwrap_or(false))
489+
}
490+
#[setter]
491+
fn set_spam_checker_spammy(&mut self, obj: bool) {
492+
set_property!(self, SpamCheckerSpammy, obj);
493+
}
494+
471495
#[getter]
472496
fn get_redacted(&self) -> PyResult<bool> {
473497
let bool = get_property!(self, Redacted)?;

synapse/federation/federation_base.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,9 @@ async def _check_sigs_and_hash(
177177
# Note: we don't redact the event so admins can inspect the event after the
178178
# fact. Other processes may redact the event, but that won't be applied to
179179
# the database copy of the event until the server's config requires it.
180-
return pdu
180+
#
181+
# We also *don't* return early here as we would still like to evaluate
182+
# `spam_checker_spammy`, for completeness.
181183

182184
spam_check = await self._spam_checker_module_callbacks.check_event_for_spam(pdu)
183185

@@ -194,6 +196,8 @@ async def _check_sigs_and_hash(
194196
# using the event in prev_events).
195197
redacted_event = prune_event(pdu)
196198
redacted_event.internal_metadata.soft_failed = True
199+
# Mark this as spam so we don't re-evaluate soft-failure status.
200+
redacted_event.internal_metadata.spam_checker_spammy = True
197201
return redacted_event
198202

199203
return pdu

synapse/storage/databases/main/sticky_events.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,10 +302,14 @@ def insert_sticky_events_txn(
302302
Skips inserting events:
303303
- if they are considered spammy by the policy server;
304304
(unsure if correct, track: https://github.com/matrix-org/matrix-spec-proposals/pull/4354#discussion_r2727593350)
305+
- if they are considered spammy by a Synapse spam checker module;
305306
- if they are rejected;
306307
- if they are outliers (they should be reconsidered for insertion when de-outliered); or
307308
- if they are not sticky (e.g. if the stickiness expired).
308309
310+
Note: Soft-failed sticky events ARE inserted, as their soft-failed status
311+
could be re-evaluated later.
312+
309313
Skipping the insertion of these types of 'invalid' events is useful for performance reasons because
310314
they would fill up the table yet we wouldn't show them to clients anyway.
311315
@@ -321,7 +325,12 @@ def insert_sticky_events_txn(
321325
sticky_events: list[tuple[EventBase, int]] = []
322326
for ev in events:
323327
# MSC: Note: policy servers and other similar antispam techniques still apply to these events.
324-
if ev.internal_metadata.policy_server_spammy:
328+
# We don't filter out soft-failed events altogether (in case they get re-evaluated later),
329+
# so filter out `spam_checker_spammy` events specifically as we don't want to re-evaluate _those_ later.
330+
if (
331+
ev.internal_metadata.policy_server_spammy
332+
or ev.internal_metadata.spam_checker_spammy
333+
):
325334
continue
326335
# We shouldn't be passed rejected events, but if we do, we filter them out too.
327336
if ev.rejected_reason is not None:
@@ -332,7 +341,7 @@ def insert_sticky_events_txn(
332341
sticky_duration = ev.sticky_duration()
333342
if sticky_duration is None:
334343
continue
335-
# Calculate the end time as start_time + effecitve sticky duration
344+
# Calculate the end time as start_time + effective sticky duration
336345
expires_at = min(ev.origin_server_ts, now_ms) + sticky_duration.as_millis()
337346
# Filter out already expired sticky events
338347
if expires_at <= now_ms:

synapse/synapse_rust/events.pyi

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,19 @@ class EventInternalMetadata:
3838
policy_server_spammy: bool
3939
"""whether the policy server indicated that this event is spammy"""
4040

41+
spam_checker_spammy: bool
42+
"""Whether a spam checker module indicated that this event is spammy
43+
44+
Note that spam checkers also cause the event to be marked as soft-failed.
45+
46+
This flags exists for two reasons:
47+
1. as debugging information
48+
2. to prevent the soft-failed re-evaluation of spammy events
49+
(the re-evaluation behaviour originates from MSC4354 Sticky Events)
50+
51+
Note that historical spammy events won't have this flag.
52+
"""
53+
4154
txn_id: str
4255
"""The transaction ID, if it was set when the event was created."""
4356
delay_id: str

tests/module_api/test_spamchecker.py

Lines changed: 187 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,33 @@
1212
# <https://www.gnu.org/licenses/agpl-3.0.html>.
1313
#
1414
#
15+
from http import HTTPStatus
1516
from typing import Literal
1617

1718
from twisted.internet.testing import MemoryReactor
1819

19-
from synapse.api.constants import EventContentFields, EventTypes
20+
from synapse.api.constants import (
21+
EventContentFields,
22+
EventTypes,
23+
Membership,
24+
)
25+
from synapse.api.room_versions import RoomVersions
2026
from synapse.config.server import DEFAULT_ROOM_VERSION
27+
from synapse.events import make_event_from_dict
28+
from synapse.module_api import EventBase
2129
from synapse.rest import admin, login, room, room_upgrade_rest_servlet
2230
from synapse.server import HomeServer
2331
from synapse.types import Codes, JsonDict
2432
from synapse.util.clock import Clock
2533

34+
from tests import unittest
2635
from tests.server import FakeChannel
2736
from tests.unittest import HomeserverTestCase
2837

2938

3039
class SpamCheckerTestCase(HomeserverTestCase):
40+
"""Tests for the spam checker module API."""
41+
3142
servlets = [
3243
room.register_servlets,
3344
admin.register_servlets,
@@ -284,3 +295,178 @@ async def user_may_send_state_event(
284295

285296
self.assertEqual(channel.code, 403)
286297
self.assertEqual(channel.json_body["errcode"], Codes.FORBIDDEN)
298+
299+
300+
class FederatedEventSpamCheckMetadataTestCase(unittest.FederatingHomeserverTestCase):
301+
servlets = [
302+
admin.register_servlets,
303+
login.register_servlets,
304+
room.register_servlets,
305+
]
306+
307+
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
308+
super().prepare(reactor, clock, hs)
309+
self._module_api = hs.get_module_api()
310+
self._store = hs.get_datastores().main
311+
self._storage_controllers = hs.get_storage_controllers()
312+
self._federation_event_handler = hs.get_federation_event_handler()
313+
self._federation_server = hs.get_federation_server()
314+
self._state_handler = hs.get_state_handler()
315+
self._persistence_controller = hs.get_storage_controllers().persistence
316+
317+
# Create a room
318+
user1_id = self.register_user("user1", "pass")
319+
user1_tok = self.login(user1_id, "pass")
320+
self.room_id = self.helper.create_room_as(
321+
user1_id,
322+
tok=user1_tok,
323+
is_public=True,
324+
room_version=RoomVersions.V10.identifier,
325+
)
326+
327+
# Prepare a join for the 'remote' user
328+
state_map = self.get_success(
329+
self._storage_controllers.state.get_current_state(self.room_id)
330+
)
331+
forward_extremity_event_ids = self.get_success(
332+
self.hs.get_datastores().main.get_latest_event_ids_in_room(self.room_id)
333+
)
334+
self.remote_user_id = f"@remoteuser:{self.OTHER_SERVER_NAME}"
335+
self.remote_user_join_event = make_event_from_dict(
336+
self.add_hashes_and_signatures_from_other_server(
337+
{
338+
"room_id": self.room_id,
339+
"sender": self.remote_user_id,
340+
"state_key": self.remote_user_id,
341+
"depth": 1000,
342+
"origin_server_ts": 1,
343+
"type": EventTypes.Member,
344+
"content": {"membership": Membership.JOIN},
345+
"auth_events": [
346+
state_map[(EventTypes.Create, "")].event_id,
347+
state_map[(EventTypes.JoinRules, "")].event_id,
348+
],
349+
"prev_events": list(forward_extremity_event_ids),
350+
}
351+
),
352+
room_version=RoomVersions.V10,
353+
)
354+
355+
# Send the join
356+
self.get_success(
357+
self._federation_event_handler.on_receive_pdu(
358+
self.OTHER_SERVER_NAME, self.remote_user_join_event
359+
)
360+
)
361+
362+
# Check the join made it to the 'local' view of the room
363+
self.helper.get_event(
364+
room_id=self.room_id,
365+
event_id=self.remote_user_join_event.event_id,
366+
tok=user1_tok,
367+
expect_code=HTTPStatus.OK,
368+
)
369+
370+
def test_federated_events_with_spam_checker_metadata(self) -> None:
371+
"""
372+
Simulates receiving spammy and non-spammy events over federation,
373+
then checks their `spam_checker_spammy` flag is set properly.
374+
"""
375+
376+
async def check_event_for_spam(event: EventBase) -> Literal["NOT_SPAM"] | Codes:
377+
if event.type == EventTypes.Message:
378+
if "ham" not in event.content["body"]:
379+
return Codes.FORBIDDEN
380+
return "NOT_SPAM"
381+
382+
# Register a spam checker callback that only allows messages with 'ham'
383+
self._module_api.register_spam_checker_callbacks(
384+
check_event_for_spam=check_event_for_spam
385+
)
386+
387+
# Prepare a spammy and a non-spammy event.
388+
forward_extremity_event_ids = self.get_success(
389+
self._store.get_latest_event_ids_in_room(self.room_id)
390+
)
391+
state_map = self.get_success(
392+
self._storage_controllers.state.get_current_state(self.room_id)
393+
)
394+
spammy_event = make_event_from_dict(
395+
self.add_hashes_and_signatures_from_other_server(
396+
{
397+
"room_id": self.room_id,
398+
"sender": self.remote_user_id,
399+
"depth": 2000,
400+
"origin_server_ts": 2,
401+
"type": EventTypes.Message,
402+
"content": {"body": "this is spam", "msgtype": "m.text"},
403+
"auth_events": [
404+
state_map[(EventTypes.Create, "")].event_id,
405+
state_map[(EventTypes.JoinRules, "")].event_id,
406+
state_map[(EventTypes.Member, self.remote_user_id)].event_id,
407+
],
408+
"prev_events": list(forward_extremity_event_ids),
409+
}
410+
),
411+
room_version=RoomVersions.V10,
412+
)
413+
non_spammy_event = make_event_from_dict(
414+
self.add_hashes_and_signatures_from_other_server(
415+
{
416+
"room_id": self.room_id,
417+
"sender": self.remote_user_id,
418+
"depth": 2000,
419+
"origin_server_ts": 2,
420+
"type": EventTypes.Message,
421+
"content": {"body": "delicious ham", "msgtype": "m.text"},
422+
"auth_events": [
423+
state_map[(EventTypes.Create, "")].event_id,
424+
state_map[(EventTypes.JoinRules, "")].event_id,
425+
state_map[(EventTypes.Member, self.remote_user_id)].event_id,
426+
],
427+
"prev_events": list(forward_extremity_event_ids),
428+
}
429+
),
430+
room_version=RoomVersions.V10,
431+
)
432+
433+
# Receive these events over federation
434+
# We need to let the federation server have them because it will
435+
# invoke `_check_sigs_and_hash` which invokes the spam checker.
436+
self.get_success(
437+
self._federation_server._handle_received_pdu(
438+
self.OTHER_SERVER_NAME, spammy_event
439+
)
440+
)
441+
self.get_success(
442+
self._federation_server._handle_received_pdu(
443+
self.OTHER_SERVER_NAME, non_spammy_event
444+
)
445+
)
446+
447+
# Retrieve the events from the database
448+
retrieved_spammy_event = self.get_success(
449+
self._store.get_event(spammy_event.event_id, allow_rejected=True)
450+
)
451+
retrieved_non_spammy_event = self.get_success(
452+
self._store.get_event(non_spammy_event.event_id, allow_rejected=True)
453+
)
454+
455+
# Assert the spammy flags (and soft-failed flags, for good measure) are set properly
456+
self.assertTrue(
457+
retrieved_spammy_event.internal_metadata.spam_checker_spammy,
458+
"Spammy inbound event should be marked as spam_checker_spammy!",
459+
)
460+
self.assertTrue(
461+
retrieved_spammy_event.internal_metadata.is_soft_failed(),
462+
"Spammy inbound event should be soft-failed.",
463+
)
464+
465+
self.assertFalse(
466+
retrieved_non_spammy_event.internal_metadata.spam_checker_spammy,
467+
"Non-spammy inbound event should not be marked as spam_checker_spammy!",
468+
)
469+
self.assertFalse(
470+
retrieved_non_spammy_event.internal_metadata.is_soft_failed(),
471+
"Non-spammy inbound event should not be soft-failed.",
472+
)

tests/rest/client/sliding_sync/test_rooms_meta.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# <https://www.gnu.org/licenses/agpl-3.0.html>.
1313
#
1414
import logging
15+
from typing import Any
1516

1617
from parameterized import parameterized, parameterized_class
1718

@@ -966,7 +967,7 @@ def test_rooms_bump_stamp_backfill(self) -> None:
966967
creator = "@user:other"
967968
room_id = "!foo:other"
968969
room_version = RoomVersions.V10
969-
shared_kwargs = {
970+
shared_kwargs: dict[str, Any] = {
970971
"room_id": room_id,
971972
"room_version": room_version.identifier,
972973
}

tests/storage/test_roommember.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
#
2121
#
2222
import logging
23-
from typing import cast
23+
from typing import Any, cast
2424

2525
from twisted.internet.testing import MemoryReactor
2626

@@ -238,7 +238,7 @@ def test_join_locally_forgotten_room(self) -> None:
238238
creator = "@user:other"
239239
room_id = "!foo:other"
240240
room_version = RoomVersions.V10
241-
shared_kwargs = {
241+
shared_kwargs: dict[str, Any] = {
242242
"room_id": room_id,
243243
"room_version": room_version.identifier,
244244
}

tests/storage/test_sliding_sync_tables.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#
1919
#
2020
import logging
21-
from typing import cast
21+
from typing import Any, cast
2222

2323
import attr
2424
from parameterized import parameterized
@@ -873,7 +873,7 @@ def test_joined_room_bump_stamp_backfill(self) -> None:
873873
creator = "@user:other"
874874
room_id = "!foo:other"
875875
room_version = RoomVersions.V10
876-
shared_kwargs = {
876+
shared_kwargs: dict[str, Any] = {
877877
"room_id": room_id,
878878
"room_version": room_version.identifier,
879879
}

0 commit comments

Comments
 (0)