Amazon CloudSearch spike project
Gist: https://gist.github.com/nanvel/4f7696174ac3a9b3554c
""" Search bebop series. """ import arrow import json from tornado import options from tornado.httpclient import HTTPError, HTTPClient, HTTPRequest from tornado_botocore import Botocore from tvs import TVS DOMAIN_NAME = 'test-bebop-domain' API_VERSION = '2013-01-01' if __name__ == '__main__': options.parse_command_line() # create domain cs_create_domain = Botocore( service='cloudsearch', operation='CreateDomain', region_name='us-west-2') session = cs_create_domain.session try: # create domain, domain will be reused if already exists print cs_create_domain.call(domain_name=DOMAIN_NAME) # { # "DomainStatus":{ # "DomainId":"240020657974/test-bebop-domain", # "Created":true, # "SearchService":{}, # "SearchInstanceCount":0, # "DomainName":"test-bebop-domain", # "DocService":{}, # "Deleted":false, # "Processing":false, # "RequiresIndexDocuments":false, # "ARN":"arn:aws:cloudsearch:us-west-2:240020657974:domain/test-bebop-domain", # "SearchPartitionCount":0 # }, # "ResponseMetadata":{ # "RequestId":"38b0cba7-60f2-11e4-980e-6d6976ea3108" # } # } except HTTPError as e: print e.response.body # configure fields cs_define_index_field = Botocore( service='cloudsearch', operation='DefineIndexField', region_name='us-west-2', session=session) # Fields: # - title - text + show in result # - airdate - uint # - genre - literal + facet enabled (or literal-array?) # - content - text FIELDS = [{ 'DomainName': DOMAIN_NAME, 'IndexField': { 'IndexFieldName': 'title', 'IndexFieldType': 'text', 'TextOptions': { 'HighlightEnabled': False, 'DefaultValue': 'untitled', 'ReturnEnabled': True, } } }, { 'DomainName': DOMAIN_NAME, 'IndexField': { 'IndexFieldName': 'content', 'IndexFieldType': 'text', 'TextOptions': { 'HighlightEnabled': False, 'DefaultValue': '', 'ReturnEnabled': False, } } }, { 'DomainName': DOMAIN_NAME, 'IndexField': { 'IndexFieldName': 'airdate', 'IndexFieldType': 'int', 'IntOptions': { 'DefaultValue': 946684800, } } }, { 'DomainName': DOMAIN_NAME, 'IndexField': { 'IndexFieldName': 'genre', 'IndexFieldType': 'literal-array', 'LiteralArrayOptions': { 'DefaultValue': '', 'FacetEnabled': True, 'ReturnEnabled': False, 'SearchEnabled': True, } } }] try: for params in FIELDS: print cs_define_index_field.call(**params) except HTTPError as e: print e.response.body # add data batch = [] for tv in TVS: batch.append({ 'type': 'add', 'id': tv['number'], 'fields': { 'title': tv['title'], 'content': tv['content'], 'airdate': arrow.get(tv['airdate'], ['YYYY-MM-DD', 'MMMM D, YYYY']).timestamp, 'genre': tv['genre'], } }) # get document and search endpoints cs_describe_domains = Botocore( service='cloudsearch', operation='DescribeDomains', region_name='us-west-2', session=session) response = cs_describe_domains.call(domain_names=[DOMAIN_NAME]) # { # "DomainStatusList":[ # { # "DomainId":"240020657974/test-bebop-domain", # "Created":true, # "SearchService":{ # "Endpoint":"search-test-bebop-domain-kmvxd5zzot4opij6zvb6okvrma.us-west-2.cloudsearch.amazonaws.com" # }, # "SearchInstanceCount":1, # "DomainName":"test-bebop-domain", # "DocService":{ # "Endpoint":"doc-test-bebop-domain-kmvxd5zzot4opij6zvb6okvrma.us-west-2.cloudsearch.amazonaws.com" # }, # "SearchInstanceType":"search.m1.small", # "Deleted":false, # "Processing":false, # "RequiresIndexDocuments":true, # "ARN":"arn:aws:cloudsearch:us-west-2:240020657974:domain/test-bebop-domain", # "SearchPartitionCount":1 # } # ], # "ResponseMetadata":{ # "RequestId":"7993ac9b-6101-11e4-8510-8ffcccb94f21" # } # } search_endpoint = response['DomainStatusList'][0]['SearchService']['Endpoint'] document_endpoint = response['DomainStatusList'][0]['DocService']['Endpoint'] httpclient = HTTPClient() # reindex cs_index_documents = Botocore( service='cloudsearch', operation='IndexDocuments', region_name='us-west-2', session=session) print cs_index_documents.call(domain_name=DOMAIN_NAME) # wait unil reindex complete # add documents url = 'http://{document_endpoint}/{api_version}/documents/batch'.format( document_endpoint=document_endpoint, api_version=API_VERSION) try: request = HTTPRequest( url=url, body=json.dumps(batch), headers={'Content-Type': 'application/json'}, method='POST') request.params = None cs_describe_domains.endpoint.auth.add_auth(request=request) response = httpclient.fetch(request=request) print response.body except HTTPError as e: print e.response.body # search url = 'http://{search_endpoint}/{api_version}/search?q=bebop'.format( search_endpoint=search_endpoint, api_version=API_VERSION) request = HTTPRequest( url=url, headers={'Content-Type': 'application/json'}, method='GET') request.params = None cs_describe_domains.endpoint.auth.add_auth(request=request) response = httpclient.fetch(request=request) print response.body # { # "status":{ # "rid":"st/UtJYpAAoghec=", # "time-ms":82 # }, # "hits":{ # "found":12, # "start":0, # "hit":[ # { # "id":"3", # "fields":{ # "airdate":"910396800", # "title":"Honky Tonk Women" # } # }, # { # "id":"18", # "fields":{ # "airdate":"920073600", # "title":"Speak Like a Child" # } # }, # ... # ] # } # }
Licensed under CC BY-SA 3.0