
Refactor Search

- Search engine specifications (search URL, parameters, parsing paths) are now included in a JSON file. It should be easier to add or modify search engines without touching the code.
- Fixed type error in search.py.
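
For context, a minimal sketch of how an engine spec from search_engines.json is consumed to build a request URL, mirroring the new HadalyApp.search_term() below. The JSON path matches the one loaded in on_start(); the rpp default here is only an illustration, since the app reads that value from its config.

    import json
    import urllib.parse

    # Load the engine specifications (same path as in on_start()).
    with open('hadaly/data/search_engines.json') as f:
        engines = json.load(f)

    def build_search_url(engine, term, page, rpp=20):
        """Assemble a search URL from an engine spec, as search_term() does."""
        spec = engines[engine]
        # Translate the generic names (term, rpp, page) into the engine's own
        # query-parameter names, e.g. 'q', 'perPage', 'page' for the Met API.
        params = urllib.parse.urlencode({
            spec['params']['term']: term,
            spec['params']['rpp']: rpp,
            spec['params']['page']: page,
        })
        # base_url already ends with '&', so the query string is simply appended.
        return spec['base_url'] + params

    print(build_search_url('www.metmuseum.org', 'vermeer', page=1))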
Branch: master
octogene committed 5 years ago
Commit: 6b2dd84eaa
3 changed files with 120 additions and 59 deletions:
  1. hadaly/app.py (+56, -53)
  2. hadaly/data/search_engines.json (+58, -0)
  3. hadaly/search.py (+6, -6)

hadaly/app.py (+56, -53)

@@ -19,6 +19,10 @@ from PIL import Image
import tarfile
from kivy.config import Config
# Config.set('graphics', 'fullscreen', 'auto')
Config.set('kivy', 'log_level', 'debug')
from kivy.app import App
from kivy.core.window import Window
from kivy.properties import StringProperty, ListProperty, DictProperty, NumericProperty
@@ -82,7 +86,8 @@ class HadalyApp(App):
def on_start(self):
self.tempdir = tempfile.mkdtemp()
self.presentation['title'] = _('New Title')
self.engines = json.load(open('hadaly/data/search_engines.json'))
try:
if argv[1].endswith('.opah'):
self.load_slides(os.path.dirname(argv[1]), [os.path.basename(argv[1])])
@@ -417,80 +422,78 @@ class HadalyApp(App):
def search_term(self, term, engine, page):
engines = {
'met': ('http://www.metmuseum.org/'
'collection/the-collection-online/search'
'?ft={term}&ao=on&rpp={rpp}&pg={page}'),
'getty': ('http://search.getty.edu/'
'gateway/search'
'?q={term}&cat=highlight&f="Open+Content+Images"&rows={rpp}&srt=&dir=s&dsp=0&img=0&pg={page}')
}
params = urllib.parse.urlencode({self.engines[engine]['params']['term']: term,
self.engines[engine]['params']['rpp']: self.config.get('search', 'search_rpp'),
self.engines[engine]['params']['page']: page})
url = engines[engine].format(term=term,
rpp=self.config.get('search', 'search_rpp'),
page=page)
clean_url = urllib.parse.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
url = ''.join((self.engines[engine]['base_url'], params))
UrlRequest(clean_url, on_success=self.parse_results, debug=True)
UrlRequest(url, on_success=self.parse_results, debug=True)
def parse_results(self, request, data):
tree = html.fromstring(data)
search_screen = self.root.get_screen('search')
results = []
if 'metmuseum' in urllib.parse.urlparse(request.url).hostname:
if self.engines[urllib.parse.urlparse(request.url).hostname]['results']['format'] == 'html':
tree = html.fromstring(data)
try:
total_pages = tree.xpath(
'//li[starts-with(@id, "phcontent_0_phfullwidthcontent_0_paginationWidget_rptPagination_paginationLineItem_")]//a/text()')
search_screen.box.total_pages = max([int(n) for n in total_pages])
total_results = re.sub("[^0-9]", "", tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_results'])[0])
if not total_results or int(total_results) == 0:
raise ValueError
except ValueError:
self.show_popup(_('Error'), _('No results found.'))
return
objects = tree.xpath('//div[starts-with(@class, "list-view-object ")]')
for object in objects:
result = {}
result['title'] = object.xpath('./div[@class="list-view-object-info"]/a/div[@class="objtitle"]/text()')[
0]
try:
result['artist'] = \
object.xpath('./div[@class="list-view-object-info"]/div[@class="artist"]/text()')[::2][0]
except IndexError:
result['artist'] = 'Unknown'
result['thumb'] = urllib.parse.quote(object.xpath('./div[@class="list-view-thumbnail"]//img/@src')[0],
safe="%/:=&?~#+!$,;'@()*[]")
object_info = object.xpath('./div[@class="list-view-object-info"]/div[@class="objectinfo"]/text()')
result['year'] = object_info[0].strip('Dates: ')
results.append(result)
elif 'getty' in urllib.parse.urlparse(request.url).hostname:
if isinstance(tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_pages']), list):
search_screen.box.total_pages = tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_pages'])[0]
else:
search_screen.box.total_pages = tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_pages'])
for entry in tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['entries']):
artist = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['artist'])[0]
title = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['title'])[0]
date = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['date'])[0]
thumb = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['thumb'])[0]
obj_link = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['obj_link'])[0]
results.append({'title': title,
'artist': artist,
'year': date,
'thumb': urllib.parse.quote(thumb, safe="%/:=&?~#+!$,;'@()*[]"),
'obj_link': obj_link}
)
elif self.engines[urllib.parse.urlparse(request.url).hostname]['results']['format'] == 'json':
from ast import literal_eval
from functools import reduce
try:
search_screen.box.total_pages = tree.xpath('//td[@class="cs-page"]//input/@count')[0]
if int(re.sub("[^0-9]", "", tree.xpath('//strong[@id="cs-results-count"]//text()')[0])) == 0:
total_results = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_results']), data)
if int(total_results) == 0:
raise ValueError
except ValueError:
self.show_popup(_('Error'), _('No results found.'))
return
artists = tree.xpath(
'//div[@class="cs-result-data-brief"]//td[* = "Creator:" or * = "Maker Name:"]/following-sibling::td[1]/p/text()')
titles = tree.xpath(
'//div[@class="cs-result-data-brief"]//td[* = "Title:" or * = "Primary Title:"]/following-sibling::td[1]/p[@class="cs-record-link"]/a/strong/text()')
dates = tree.xpath(
'//div[@class="cs-result-data-brief"]//td[* = "Date:"]/following-sibling::td[1]/p/text()')
thumb = tree.xpath('//img[@class="cs-result-thumbnail"]/@src')
obj_link = tree.xpath(
'//div[@class="cs-result-data-brief"]//td[* = "Title:" or * = "Primary Title:"]/following-sibling::td[1]/p[@class="cs-record-link"]/a/@href')
entries = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['entries']), data)
search_screen.box.total_pages = int(total_results / len(entries))
for entry in entries:
artist = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['artist']), entry)
title = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['title']), entry)
date = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['date']), entry)
thumb = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['thumb']), entry)
obj_link = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['obj_link']), entry)
results = [{'title': a, 'artist': b, 'year': c, 'thumb': urllib.parse.quote(d, safe="%/:=&?~#+!$,;'@()*[]"),
'obj_link': e}
for a, b, c, d, e in zip(titles, artists, dates, thumb, obj_link)]
results.append({'title': title,
'artist': artist,
'year': date,
'thumb': urllib.parse.quote(thumb, safe="%/:=&?~#+!$,;'@()*[]"),
'obj_link': obj_link}
)
for photo in results:
Logger.debug('Search (MET): Loading {url}'.format(url=photo['thumb']))
Logger.debug('Search : Loading {url}'.format(url=photo['thumb']))
image = ItemButton(photo=photo, source=photo['thumb'], keep_ratio=True)
search_screen.box.grid.add_widget(image)
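
Note on the JSON-format branch above: each field's location in the API response is stored in search_engines.json as a stringified tuple of keys (e.g. "('results',)") and resolved with ast.literal_eval plus functools.reduce. A standalone sketch of that lookup; the response dict is made up, but its keys follow the Met entry in the new JSON file.

    from ast import literal_eval
    from functools import reduce

    def lookup(path_spec, data):
        """Follow a stringified tuple of keys, e.g. "('results',)", into nested dicts."""
        return reduce(dict.__getitem__, literal_eval(path_spec), data)

    # Hypothetical response shaped like the keys the Met entry expects.
    response = {'totalResults': 1,
                'results': [{'title': 'The Milkmaid', 'date': 'ca. 1660'}]}

    entries = lookup("('results',)", response)        # the list of result dicts
    total = lookup("('totalResults',)", response)      # 1
    title = lookup("('title',)", entries[0])           # 'The Milkmaid'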


hadaly/data/search_engines.json (+58, -0)

@@ -0,0 +1,58 @@
{
"www.metmuseum.org": {
"base_url": "http://www.metmuseum.org/api/collection/collectionlisting?showOnly=withImage&",
"params": {
"rpp": "perPage",
"artist": "artist",
"page": "page",
"term": "q",
"sort": "sortBy",
"sortings": {
"relevance": "Relevance"
},
"order": "sortOrder",
"orderings": {
"asc": "asc"
}
},
"results": {
"format": "json",
"entries": "('results',)",
"total_results": "('totalResults',)",
"title": "('title',)",
"artist": "('description',)",
"date": "('date',)",
"thumb": "('image',)",
"obj_link": "('largeImage',)"
}
},
"search.getty.edu": {
"base_url": "http://search.getty.edu/gateway/search?cat=highlight&f=\"Open+Content+Images\"&dir=s&img=1&dsp=0&",
"params": {
"rpp": "rows",
"artist": "artist",
"page": "pg",
"term": "q",
"sort": "sortBy",
"sortings": {
"relevance": "Relevance"
},
"order": "sortOrder",
"orderings": {
"asc": "asc"
}
},
"results": {
"format": "html",
"entries": "//div[@class=\"cs-result-item\"]",
"title": ".//div[@class=\"cs-result-data-brief\"]//td[* = \"Title:\" or * = \"Primary Title:\"]/following-sibling::td[1]/p[@class=\"cs-record-link\"]/a/strong/text()",
"artist": ".//div[@class=\"cs-result-data-brief\"]//td[* = \"Creator:\" or * = \"Maker Name:\"]/following-sibling::td[1]/p/text()",
"date": ".//div[@class=\"cs-result-data-brief\"]//td[* = \"Date:\"]/following-sibling::td[1]/p/text()",
"thumb": ".//img[@class=\"cs-result-thumbnail\"]/@src",
"obj_link": ".//div[@class=\"cs-result-data-brief\"]//td[* = \"Title:\" or * = \"Primary Title:\"]/following-sibling::td[1]/p[@class=\"cs-record-link\"]/a/@href",
"total_pages": "//td[@class=\"cs-page\"]//input/@count",
"total_results": "//strong[@id=\"cs-results-count\"]//text()"
}
}
}
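
As the commit message says, adding a new engine should now only require another entry in this file: a base URL, the engine's query-parameter names, and either XPath expressions (format "html") or key paths (format "json") for the result fields. Entries are keyed by hostname because parse_results() looks the spec up via the request URL's hostname. A rough, entirely hypothetical sketch of checking such an entry for the keys the app reads; the engine and its field names below are invented.

    # Keys that search_term() and the JSON-format branch of parse_results() read.
    REQUIRED_PARAMS = {'term', 'rpp', 'page'}
    REQUIRED_RESULTS = {'format', 'entries', 'total_results',
                        'title', 'artist', 'date', 'thumb', 'obj_link'}

    # Entirely hypothetical engine entry, for illustration only.
    new_engine = {
        'base_url': 'https://example.org/search?',
        'params': {'term': 'q', 'rpp': 'limit', 'page': 'p'},
        'results': {'format': 'json',
                    'entries': "('items',)", 'total_results': "('total',)",
                    'title': "('name',)", 'artist': "('creator',)",
                    'date': "('year',)", 'thumb': "('thumbnail',)",
                    'obj_link': "('url',)"},
    }

    missing = (REQUIRED_PARAMS - set(new_engine['params'])) | \
              (REQUIRED_RESULTS - set(new_engine['results']))
    if missing:
        raise ValueError('engine entry is missing: {}'.format(sorted(missing)))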

hadaly/search.py (+6, -6)

@@ -32,8 +32,8 @@ class SearchBox(BoxLayout):
Logger.debug('Search: requesting {term} from {provider}'.format(term=term,
provider=provider))
self.grid.clear_widgets()
providers = {'MetMuseum': 'met',
'Getty OCI': 'getty'}
providers = {'MetMuseum': 'www.metmuseum.org',
'Getty OCI': 'search.getty.edu'}
if not term:
popup = Popup(title=_('Error'), size_hint=(0.3, 0.2))
popup.add_widget(Label(text=_('Please enter a search term.')))
@@ -58,7 +58,7 @@ class SearchBox(BoxLayout):
self.app.search_term(term, providers[provider], self.current_page)
def search_next(self, text, provider):
if self.current_page < self.total_pages:
if self.current_page < int(self.total_pages):
self.current_page += 1
self.search(text, provider)
@@ -105,7 +105,7 @@ class SearchItemInfo(Popup):
if self.provider == 'MET':
# Check if high-res is available.
url = self.photo['thumb'].replace('web-thumb', 'original')
url = self.photo['thumb'].replace('mobile-large', 'original')
req = urllib.request.urlopen(url)
if req.getcode() == 404:
Logger.debug('Search: High-res image not available.')
@@ -120,9 +120,9 @@ class SearchItemInfo(Popup):
req = urllib.request.urlopen(self.photo['obj_link'])
tree = html.parse(req)
img_link = tree.xpath('//div[@class="cs-result-image"]//span[@class="nav"]/a/@href')
img_link = tree.xpath('//a[@id="download-open-content"]/@href')
if img_link:
link = urllib.parse.urlparse(tree.xpath('//div[@class="cs-result-image"]//span[@class="nav"]/a/@href')[0])
link = urllib.parse.urlparse(img_link[0])
url = urllib.parse.parse_qs(link.query)['dlimgurl'][0]
else:
url = self.photo['thumb']
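
The "type error" mentioned in the commit message appears to be the comparison fixed in search_next() above: total_pages is scraped from an XPath @count attribute and therefore arrives as a string, and Python 3 refuses to order an int against a str, hence the int() cast. A minimal reproduction, with made-up values:

    current_page = 1
    total_pages = '12'   # e.g. xpath('//td[@class="cs-page"]//input/@count')[0] yields a str

    try:
        current_page < total_pages            # the old comparison
    except TypeError as exc:
        print(exc)                            # '<' not supported between instances of 'int' and 'str'

    print(current_page < int(total_pages))    # the fixed comparison -> True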

