Skip to content

Commit

Permalink
Overhaul algorithm, improve wordlists (0.7.17)
Browse files Browse the repository at this point in the history
  • Loading branch information
finnbear committed Dec 9, 2023
1 parent 9cf04fe commit 9502430
Show file tree
Hide file tree
Showing 16 changed files with 11,846 additions and 5,893 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "rustrict"
authors = ["Finn Bear"]
version = "0.7.16"
version = "0.7.17"
edition = "2021"
license = "MIT OR Apache-2.0"
repository = "https://github.com/finnbear/rustrict/"
Expand Down Expand Up @@ -71,7 +71,7 @@ serde = {version = "1", features=["derive"], optional = true}
rand = "0.8"
csv = "1.1"
censor_crate = {package = "censor", version = "0.3.0"}
rustrict_old = {package = "rustrict", version = "0.7.12"}
rustrict_old = {package = "rustrict", version = "0.7.15"}
serial_test = "0.5"
bincode = "1.3.3"
serde_json = "1"
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
- Performant
- O(n) analysis and censoring
- No `regex` (uses custom trie)
- 4 MB/s in `release` mode
- 3 MB/s in `release` mode
- 100 KB/s in `debug` mode

## Limitations
Expand Down Expand Up @@ -177,7 +177,7 @@ is used as a dataset. Positive accuracy is the percentage of profanity detected

| Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |
|-------|----------|-------------------|-------------------|------|
| [rustrict](https://crates.io/crates/rustrict) | 87.88% | 93.33% | 86.52% | 8s |
| [rustrict](https://crates.io/crates/rustrict) | 80.18% | 93.93% | 76.76% | 8s |
| [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s |

## Development
Expand Down
98 changes: 43 additions & 55 deletions src/censor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -403,22 +403,8 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
.inline
.uppercase
.saturating_add(raw_c.is_uppercase() as u8);
/*
// Very old whitelist (allows a ton of abuse):
let skippable = match c {
' ' | '~' | '-' | '−' | '_' | '.' | '!' | '?' | ',' | '*' | '"' | '\'' | '\n' | '\r'
| '\t' => true,
_ => false,
};
// More recent whitelist (still allows abuse like f^u^c^k):
let skippable = raw_c.is_punctuation()
|| raw_c.is_separator()
|| is_whitespace(raw_c)
|| matches!(raw_c, '(' | ')');
// Use a blacklist instead:
*/
let skippable = !raw_c.is_alphanumeric() || is_whitespace(raw_c);

let skippable = !raw_c.is_alphabetic() || is_whitespace(raw_c);
let replacement = self.options.replacements.get(raw_c);

#[cfg(feature = "trace")]
Expand Down Expand Up @@ -477,6 +463,7 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
spaces: 0,
skipped: 0,
replacements: 0,
repetitions: 0,
low_confidence_replacements: 0,
});
}
Expand Down Expand Up @@ -504,7 +491,7 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
.chars()
{
// This replacement (uppercase to lowercase) raises absolutely zero suspicion.
let benign_replacement = c == raw_c_lower;
let benign_replacement = c == raw_c || c == raw_c_lower;

// This counts as a replacement, mainly for spam detection purposes.
let countable_replacement = !(replacement_counted
Expand Down Expand Up @@ -545,28 +532,23 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {

#[cfg(feature = "trace")]
println!(
" - Consider match \"{}\" with spaces={}",
m.node.trace, m.spaces
" - Consider match \"{}\" with spaces={}, replacements={}",
m.node.trace, m.spaces, m.replacements
);

if (skippable || c == m.last) && m.start != pos.unwrap_or(0) {
// Undo remove.
#[cfg(feature = "trace")]
println!("undo remove \"{}\" where last={}, node last={:?} and initial spaces={}", m.node.trace, m.last, m.node.last, m.spaces);

let new_repetition = c == m.last;
if (skippable || new_repetition) && m.start != pos.unwrap_or(0) {
// Here, '.' is primarily for allowing ellipsis ("...") as a form of
// space.
// ( and ) are for ignoring appositive phrases.
// Checking node.last is to collapse multiple spaces into one, to avoid
// Checking node.last is to collapse multiple spaces into one
let new_space = matches!(c, ' ' | '.' | ',' | ':' | ';' | '…' | '(' | ')')
// && skippable
&& m.node.last != Some(' ');
// && !ignore_sep;

let new_skip = skippable && !ignore_sep;
let new_replacement = !benign_replacement && !self.inline.separate;
let new_skip = !new_space && skippable && !ignore_sep;
// dil -> dii
let new_replacement = c == m.last && raw_c != c;
let new_low_confidence_replacement =
!benign_replacement && raw_c.is_ascii_digit();
new_replacement && raw_c.is_ascii_digit();

let undo_m = Match {
spaces: m.spaces.saturating_add(new_space as u8),
Expand All @@ -575,8 +557,12 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
low_confidence_replacements: m
.low_confidence_replacements
.saturating_add(new_low_confidence_replacement as u8),
repetitions: m.repetitions.saturating_add(new_repetition as u8),
..m
};
#[cfg(feature = "trace")]
println!(" (keep with last={}, node last={:?}, spaces={}, skip={}, repl={}, repet={})", undo_m.last, undo_m.node.last, undo_m.spaces, undo_m.skipped, undo_m.replacements, undo_m.repetitions);

if let Some(existing) = self.allocated.matches.get(&undo_m) {
let replacement = existing.combine(&undo_m);
self.allocated.matches.replace(replacement);
Expand All @@ -586,26 +572,26 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
}

if let Some(next) = m.node.children.get(&c) {
let new_replacement = !benign_replacement && (c != raw_c) && c != ' ';
let new_low_confidence_replacement =
new_replacement && raw_c.is_ascii_digit();
let new_space =
!new_replacement && (raw_c != c && self.inline.separate && c != '\'');

let next_m = Match {
node: next,
spaces: m.spaces.saturating_add(
(c != raw_c && self.inline.separate && c != '\'') as u8,
),
replacements: m.replacements.saturating_add(
(!benign_replacement && !self.inline.separate) as u8,
),
spaces: m.spaces.saturating_add(new_space as u8),
replacements: m.replacements.saturating_add(new_replacement as u8),
low_confidence_replacements: m
.low_confidence_replacements
.saturating_add(
(!benign_replacement && raw_c.is_ascii_digit()) as u8,
),
.saturating_add(new_low_confidence_replacement as u8),
last: c,
..m
};

#[cfg(feature = "trace")]
println!(
" - Next is \"{}\", with spaces={}, replacements = {}",
" - Next is \"{}\", with spaces={}, replacements={}",
next.trace, next_m.spaces, next_m.replacements
);

Expand All @@ -614,6 +600,7 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
&& next_m.start == 0
&& next_m.spaces == 0
&& next_m.skipped == 0
&& next_m.replacements == 0
&& !self.options.ignore_false_positives
{
// Everything in the input until now is safe.
Expand All @@ -622,6 +609,7 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
self.inline.safe = true;
}

/*
#[cfg(feature = "trace")]
if !next_m.node.typ.is(Type::ANY) {
if self.options.ignore_false_positives {
Expand All @@ -634,6 +622,7 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
next_m.node.trace, next_m.spaces, next_m.skipped, next_m.replacements
);
}
*/

if next_m.node.typ.is(Type::ANY) {
self.allocated.pending_commit.push(Match {
Expand All @@ -643,6 +632,7 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
} else if next_m.spaces == 0
&& next_m.skipped == 0
&& next_m.replacements == 0
&& next_m.repetitions == 0 // as se
&& !self.options.ignore_false_positives
{
// Is false positive, so invalidate internal matches.
Expand Down Expand Up @@ -954,13 +944,6 @@ mod tests {
}
}

#[test]
#[serial]
fn issue_1() {
    // The sentence quoted in the linked issue comment must not be flagged
    // as any `Type`:
    // https://github.com/finnbear/rustrict/issues/1#issuecomment-1024426326
    let phrase = "I could say I miss you but it’s not the truth";
    let flagged_as_nothing = phrase.isnt(Type::ANY);
    assert!(flagged_as_nothing);
}

#[test]
#[serial]
fn curated() {
Expand Down Expand Up @@ -1007,21 +990,26 @@ mod tests {

if any != any_truth {
find_detection(case);
failures.push(format!("FAIL: Predicted {:?} for {}", typ, case));
}
if !any_truth {
failures.push(format!("FAIL: Predicted {:?} for: \"{}\"", typ, case));
} else if !any_truth {
// None of the current test cases contain any abusive Unicode characters.
assert_eq!(case, case.censor());
let censored = case.censor();
if case != censored {
failures.push(format!("Censored: : \"{case}\" -> {censored}"))
}
}
if let Some(safe_truth) = safe_truth {
if safe != safe_truth {
panic!("FAIL: Predicted safe={} for {}", safe, case);
failures.push(format!("FAIL: Predicted safe={} for: \"{}\"", safe, case));
}
}
}

if !failures.is_empty() {
panic!("{failures:?}");
for failure in failures {
println!("{failure}");
}
panic!();
}
}

Expand Down Expand Up @@ -1140,7 +1128,7 @@ mod tests {
"https://crates.io/crates/rustrict",
rustrict,
false, // true,
None, // Some(rustrict_old),
Some(rustrict_old),
);
print_accuracy("https://crates.io/crates/censor", censor, false, None);
}
Expand Down
13 changes: 13 additions & 0 deletions src/dictionary_blacklist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,12 @@ blackballs
blackbreast
blackbutt
blackcock
black guy
black men
black monkey
black monkeys
black people
black women
blowhards
blow job
blow jobs
Expand Down Expand Up @@ -124,6 +128,7 @@ buggeries
buggering
buggers
buggery
bukkake
bullet vibe
bullshit(.*)
bums
Expand Down Expand Up @@ -180,6 +185,7 @@ cripples
cts
cummer
cummers
cumshots
cunts
d
damned
Expand Down Expand Up @@ -317,6 +323,7 @@ gs
gyppo
h
hand job
handjobs
hater
haters
hates
Expand Down Expand Up @@ -443,6 +450,7 @@ menstruates
menstruations
mi ger
micropenis
milfs
missionary position
mo ron
mofos
Expand Down Expand Up @@ -618,6 +626,7 @@ seminude
sex(.*)
sh
shaved beaver
shemales
shi
shiite
shita
Expand Down Expand Up @@ -690,10 +699,12 @@ touch kids
tp
tr
trannie
tranny
ts
turds
tussis
twats
twinks
twits
u
ug
Expand All @@ -713,6 +724,7 @@ unnaked
unpregnant
unsexed
unsexual
upskirts
urethras
urines
ur mom
Expand All @@ -733,6 +745,7 @@ vibrators
virgins
vixens
voyeurs
voyeurweb
vulvae
vulvas
w
Expand Down
1 change: 1 addition & 0 deletions src/dictionary_common_valid_short.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ one
of
or
it
its
my
not
she
Expand Down
Loading

0 comments on commit 9502430

Please sign in to comment.