EBernhardson has uploaded a new change for review.
https://gerrit.wikimedia.org/r/286758
Change subject: Ignore results that aren't valid wiki articles
......................................................................
Ignore results that aren't valid wiki articles
Occasionally some engines return URLs that point to search, or to the
top-level domain. Ignore them rather than bailing out.
Change-Id: Ifadfc60cf96cbe6463457c217d72764c9d79749b
---
M src/RelevanceScoring/Import/HtmlResultGetter.php
A tests/unit/RelevanceScoring/Import/HtmlResultGetter.php
2 files changed, 131 insertions(+), 9 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/discernatron
refs/changes/58/286758/1
diff --git a/src/RelevanceScoring/Import/HtmlResultGetter.php
b/src/RelevanceScoring/Import/HtmlResultGetter.php
index c287881..6db5bb2 100644
--- a/src/RelevanceScoring/Import/HtmlResultGetter.php
+++ b/src/RelevanceScoring/Import/HtmlResultGetter.php
@@ -58,10 +58,6 @@
]);
}
- private function getWikiDomain($wiki)
- {
- return parse_url($this->wikis[$wiki], PHP_URL_HOST);
- }
/**
* @param ResponseInterface $response
* @param string $wiki
@@ -74,7 +70,6 @@
public function handleResponse(ResponseInterface $response, $wiki, $query)
{
if ($response->getStatusCode() !== 200) {
- var_dump($response);
throw new RuntimeException('Failed search');
}
@@ -87,16 +82,14 @@
throw new RuntimeException('No results section');
}
- $domain = strtolower($this->getWikiDomain($wiki));
$results = [];
foreach ($doc[$this->selectors['results']] as $result) {
$pq = \pq($result);
$url = $pq[$this->selectors['url']]->attr('href');
- $urlDomain = strtolower(parse_url($url, PHP_URL_HOST));
- if ($urlDomain === $domain) {
+ if ($this->isValidWikiArticle($wiki, $url)) {
$results[] = ImportedResult::createFromURL(
$this->source,
- $pq[$this->selectors['url']]->attr('href'),
+ $url,
$pq[$this->selectors['snippet']]->text(),
count($results)
);
@@ -105,4 +98,40 @@
return $results;
}
+
+ /**
+ * @param string $wiki
+ * @return string
+ */
+ private function getWikiDomain($wiki)
+ {
+ return parse_url($this->wikis[$wiki], PHP_URL_HOST);
+ }
+
+ /**
+ * @param string $url
+ * @return bool
+ */
+ private function isValidWikiArticle($wiki, $url)
+ {
+
+ $parts = parse_url($url);
+
+ $domain = strtolower($this->getWikiDomain($wiki));
+ $urlDomain = strtolower($parts['host']);
+ if ($urlDomain !== $domain) {
+ return false;
+ }
+
+ if (strlen($parts['path']) > 6 && substr($parts['path'], 0, 6) ===
'/wiki/') {
+ return true;
+ }
+
+ if (empty($parts['query'])) {
+ return false;
+ }
+
+ parse_str($parts['query'], $query);
+ return !empty($query['title']);
+ }
}
diff --git a/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php
b/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php
new file mode 100644
index 0000000..93dda2c
--- /dev/null
+++ b/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php
@@ -0,0 +1,93 @@
+<?php
+
+namespace WikiMedia\RelevanceScoring\Import;
+
+class HtmlResultGetterTest extends \PHPUnit_Framework_TestCase
+{
+
+ public static function somethingProvider()
+ {
+ $selectors = [
+ 'is_valid' => 'body',
+ 'results' => 'li',
+ 'url' => 'a',
+ 'snippet' => 'p',
+ ];
+
+ $genHtml = function (array $results) {
+ $content = '';
+ foreach ($results as $url => $snippet) {
+ $content .= "<li><a href='$url'>some text</a>";
+ $content .= "<p>$snippet</p></li>";
+ }
+ return "<html><body><ul>$content</ul></body></html>";
+ };
+
+ return [
+ 'simple wiki article' => [
+ $selectors,
+
$genHtml(['https://test.wikipedia.org/wiki/Subject' => 'blah blah blah']),
+ // expected results
+ [new ImportedResult('unittest', 'Subject',
'blah blah blah', 0)]
+ ],
+ 'article in query string' => [
+ $selectors,
+
$genHtml(['https://test.wikipedia.org/w/index.php?title=Other' => 'foo bar
baz']),
+ [new ImportedResult('unittest', 'Other', 'foo
bar baz', 0)]
+ ],
+ 'multiple articles' => [
+ $selectors,
+ $genHtml([
+ 'https://test.wikipedia.org/wiki/Other'
=> 'foo bar baz',
+
'https://test.wikipedia.org/w/index.php?title=Thing' => 'bamboozle',
+ ]),
+ [
+ new ImportedResult('unittest', 'Other',
'foo bar baz', 0),
+ new ImportedResult('unittest', 'Thing',
'bamboozle', 1),
+ ]
+ ],
+ 'decodes entities' => [
+ $selectors,
+
$genHtml(['https://test.wikipedia.org/wiki/This_%26_That' => 'a > b']),
+ [new ImportedResult('unittest', 'This & That',
'a > b', 0)]
+ ],
+ 'ignores unexpected urls' => [
+ $selectors,
+ $genHtml([
+
'https://test.wikipedia.org/?search=stuff' => 'fofofofo',
+ 'https://not.us/wiki/Coffee' => 'tea',
+ 'https://test.wikipedia.org/wiki/' =>
'still wrong',
+ ]),
+ []
+ ],
+ ];
+ }
+
+ /**
+ * @dataProvider somethingProvider
+ */
+ public function testSomething(array $selectors, $html, $expected)
+ {
+ $client = $this->getMock('GuzzleHTTP\\Client');
+ $response =
$this->getMock('Psr\\Http\\Message\\ResponseInterface');
+ $response->expects($this->any())
+ ->method('getStatusCode')
+ ->will($this->returnValue(200));
+ $response->expects($this->any())
+ ->method('getBody')
+ ->will($this->returnValue($html));
+
+
+ $getter = new HtmlResultGetter(
+ $client,
+ ['testwiki' => 'https://test.wikipedia.org/w/api.php'],
+ 'unittest',
+ 'https://test.wikipedia.org/w/index.php',
+ $selectors,
+ []
+ );
+
+ $this->assertEquals($expected,
$getter->handleResponse($response, 'testwiki', ''));
+ }
+}
+
--
To view, visit https://gerrit.wikimedia.org/r/286758
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ifadfc60cf96cbe6463457c217d72764c9d79749b
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/discernatron
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits