Dr0ptp4kt has uploaded a new change for review.
https://gerrit.wikimedia.org/r/196298
Change subject: Address nested parentheses in Share a Fact
......................................................................
Address nested parentheses in Share a Fact
Additionally, trim whitespace before semicolons.
Finally, use autoreleasepools, as a reviewer had requested.
Change-Id: I3c26877c7e4e220e84af9f192423384ea3b5c64e
---
M WikipediaUnitTests/NSString+WMFHTMLParsingTests.m
M wikipedia/Categories/NSString+WMFHTMLParsing.m
2 files changed, 112 insertions(+), 93 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/apps/ios/wikipedia refs/changes/98/196298/1
diff --git a/WikipediaUnitTests/NSString+WMFHTMLParsingTests.m b/WikipediaUnitTests/NSString+WMFHTMLParsingTests.m
index 698180d..f4dd719 100644
--- a/WikipediaUnitTests/NSString+WMFHTMLParsingTests.m
+++ b/WikipediaUnitTests/NSString+WMFHTMLParsingTests.m
@@ -33,8 +33,10 @@
}
- (void)testAdequateSnippet {
- NSString *string = @"<p>Dog (woof) [horse] adequately long string</p>";
- XCTAssertEqualObjects([string wmf_getStringSnippetWithoutHTML], @"Dog adequately long string");
+ NSString *string = @"<p>Dog (woof (w00t)) [horse] adequately long string historically 40 characters.</p>";
+ NSString *result = [string wmf_getStringSnippetWithoutHTML];
+ XCTAssertEqualObjects([string wmf_getStringSnippetWithoutHTML],
+ @"Dog adequately long string historically 40 characters.");
}
@end
diff --git a/wikipedia/Categories/NSString+WMFHTMLParsing.m b/wikipedia/Categories/NSString+WMFHTMLParsing.m
index 9c06619..0e5663b 100644
--- a/wikipedia/Categories/NSString+WMFHTMLParsing.m
+++ b/wikipedia/Categories/NSString+WMFHTMLParsing.m
@@ -42,100 +42,117 @@
}
+ (NSString*)wmf_stringSnippetSimplifiedInString:(NSString*)string {
- NSString* result = [string stringByReplacingOccurrencesOfString:@"&amp;" withString:@"&"];
- NSError* err = nil;
- NSRegularExpression* newlinesRegex = [NSRegularExpression
-
regularExpressionWithPattern:@"\n{2,}"
- options:0
- error:&err];
- NSRange range = NSMakeRange(0, result.length);
- result = [newlinesRegex stringByReplacingMatchesInString:result
+ @autoreleasepool {
+ NSString* result = [string stringByReplacingOccurrencesOfString:@"&amp;" withString:@"&"];
+ NSError* err = nil;
+ NSRegularExpression* newlinesRegex = [NSRegularExpression
+
regularExpressionWithPattern:@"\n{2,}"
+ options:0
+ error:&err];
+ NSRange range = NSMakeRange(0, result.length);
+ result = [newlinesRegex stringByReplacingMatchesInString:result
+ options:0
+ range:range
+ withTemplate:@"\n"];
+
+
+ // We probably don't want to try to handle ideographic parens
+ err = nil;
+ NSRegularExpression* parensRegex = [NSRegularExpression
+
regularExpressionWithPattern:@"[(][^()]+[)]"
+ options:0
+ error:&err];
+
+ result = [NSString wmf_recursivelyUpdateString:result withRegex:parensRegex];
+
+ // Nor do we want to try to handle ideographic brackets
+ err = nil;
+ NSRegularExpression* bracketsRegex = [NSRegularExpression
+
regularExpressionWithPattern:@"\\[[^]]+]"
+ options:0
+ error:&err];
+
+ range = NSMakeRange(0, result.length);
+ result = [bracketsRegex stringByReplacingMatchesInString:result
+ options:0
+ range:range
+ withTemplate:@""];
+
+ // Unlike parens and brackets and unlike doubled up space in general,
+ // we do not want whitespace preceding the comma, ideographic comma,
+ // or semicolon
+ err = nil;
+ NSRegularExpression* whitespaceCommaSemicolonRegex =
[NSRegularExpression
+
regularExpressionWithPattern:@"\\s+([,、;])"
options:0
- range:range
- withTemplate:@"\n"];
-
-
- // We probably don't want to try to handle ideographic parens
- err = nil;
- NSRegularExpression* parensRegex = [NSRegularExpression
-
regularExpressionWithPattern:@"[(][^)]+[)]"
- options:0
- error:&err];
- range = NSMakeRange(0, result.length);
- result = [parensRegex stringByReplacingMatchesInString:result
- options:0
- range:range
- withTemplate:@""];
-
- // Nor do we want to try to handle ideographic brackets
- err = nil;
- NSRegularExpression* bracketsRegex = [NSRegularExpression
-
regularExpressionWithPattern:@"\\[[^]]+]"
- options:0
- error:&err];
- range = NSMakeRange(0, result.length);
- result = [bracketsRegex stringByReplacingMatchesInString:result
- options:0
- range:range
- withTemplate:@""];
-
- // Unlike parens and brackets and unlike doubled up space in general,
- // we do not want whitespace preceding the comma or ideographic comma
- err = nil;
- NSRegularExpression* whitespaceCommaRegex = [NSRegularExpression
-
regularExpressionWithPattern:@"\\s+([,、])"
+ error:&err];
+ range = NSMakeRange(0, result.length);
+ result = [whitespaceCommaSemicolonRegex
stringByReplacingMatchesInString:result
+ options:0
+ range:range
+ withTemplate:@"$1"];
+
+ // Ideographic stops from TextExtracts, which were from OpenSearch
+ err = nil;
+ NSRegularExpression* whitespacePeriodRegex = [NSRegularExpression
+
regularExpressionWithPattern:@"\\s+([\\.|。|.|。])"
+ options:0
+ error:&err];
+ range = NSMakeRange(0, result.length);
+ result = [whitespacePeriodRegex stringByReplacingMatchesInString:result
+ options:0
+ range:range
+
withTemplate:@"$1"];
+
+ // In practice, we rarely care about doubled up whitespace in the
+ // string except for the actual space character
+ err = nil;
+ NSRegularExpression* spacesRegex = [NSRegularExpression
+ regularExpressionWithPattern:@" {2,}"
+ options:0
+ error:&err];
+ range = NSMakeRange(0, result.length);
+ result = [spacesRegex stringByReplacingMatchesInString:result
+ options:0
+ range:range
+ withTemplate:@" "];
+
+ // Note about trailing colon characters: they usually look strange if kept,
+ // and removing them (plus spaces and newlines) doesn't often create merged
+ // words that look bad - these are usually at tag boundaries. For Latinized
+ // langs sometimes this means words like "include" finish the snippet.
+ // But as a matter of markup structure, something like a <p> tag
+ // shouldn't be </p> closed until something like <ul>...</ul> is closed.
+ // In fact, some sections have this layout, and some do not.
+ err = nil;
+ NSRegularExpression* leadingTrailingWhitespaceNewlineRegex =
[NSRegularExpression
+
regularExpressionWithPattern:@"^[\\s\n]+|[\\s\n:]+$"
options:0
-
error:&err];
- range = NSMakeRange(0, result.length);
- result = [whitespaceCommaRegex stringByReplacingMatchesInString:result
- options:0
- range:range
- withTemplate:@"$1"];
+
error:&err];
+ range = NSMakeRange(0, result.length);
+ result = [leadingTrailingWhitespaceNewlineRegex
stringByReplacingMatchesInString:result
+
options:0
+
range:range
+
withTemplate:@""];
+
+ return result;
+ }
+}
- // Ideographic stops from TextExtracts, which were from OpenSearch
- err = nil;
- NSRegularExpression* whitespacePeriodRegex = [NSRegularExpression
-
regularExpressionWithPattern:@"\\s+([\\.|。|.|。])"
-
options:0
-
error:&err];
- range = NSMakeRange(0, result.length);
- result = [whitespacePeriodRegex stringByReplacingMatchesInString:result
- options:0
- range:range
- withTemplate:@"$1"];
-
- // In practice, we rarely care about doubled up whitespace in the
- // string except for the actual space character
- err = nil;
- NSRegularExpression* spacesRegex = [NSRegularExpression
- regularExpressionWithPattern:@" {2,}"
- options:0
- error:&err];
- range = NSMakeRange(0, result.length);
- result = [spacesRegex stringByReplacingMatchesInString:result
- options:0
- range:range
- withTemplate:@" "];
-
- // Note about trailing colon characters: they usually look strange if kept,
- // and removing them (plus spaces and newlines) doesn't often create merged
- // words that look bad - these are usually at tag boundaries. For Latinized
- // langs sometimes this means words like "include" finish the snippet.
- // But as a matter of markup structure, something like a <p> tag
- // shouldn't be </p> closed until something like <ul>...</ul> is closed.
- // In fact, some sections have this layout, and some do not.
- err = nil;
- NSRegularExpression* leadingTrailingWhitespaceNewlineRegex =
[NSRegularExpression
-
regularExpressionWithPattern:@"^[\\s\n]+|[\\s\n:]+$"
-
options:0
-
error:&err];
- range = NSMakeRange(0, result.length);
- result = [leadingTrailingWhitespaceNewlineRegex
stringByReplacingMatchesInString:result
-
options:0
-
range:range
-
withTemplate:@""];
-
- return result;
++wmf_recursivelyUpdateString : (NSString*)string withRegex : (NSRegularExpression*)regex {
+ NSString* oldResult;
+ NSRange range;
+ @autoreleasepool {
+ do {
+ oldResult = [string copy];
+ range = NSMakeRange(0, string.length);
+ string = [regex stringByReplacingMatchesInString:string
+ options:0
+ range:range
+ withTemplate:@""];
+ } while (![oldResult isEqualToString:string]);
+ return string;
+ }
}
@end
--
To view, visit https://gerrit.wikimedia.org/r/196298
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I3c26877c7e4e220e84af9f192423384ea3b5c64e
Gerrit-PatchSet: 1
Gerrit-Project: apps/ios/wikipedia
Gerrit-Branch: master
Gerrit-Owner: Dr0ptp4kt <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits