devicehub-teal/ereuse_devicehub/resources/device/search.py

from itertools import chain

import inflection
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import TSVECTOR
from sqlalchemy.orm import aliased

from ereuse_devicehub.db import db
from ereuse_devicehub.resources import search
from ereuse_devicehub.resources.action.models import Action, ActionWithMultipleDevices, \
    ActionWithOneDevice
from ereuse_devicehub.resources.agent.models import Organization
from ereuse_devicehub.resources.device.models import Component, Computer, Device
from ereuse_devicehub.resources.tag.model import Tag


class DeviceSearch(db.Model):
    """Unlogged table keeping one full-text search document per device.

    Every row holds two ``TSVECTOR`` columns for a device: one built
    from the device's own properties (and, for computers, from its
    components) and one built from its tags. The class methods below
    refresh those rows for the devices touched in a session and can
    rebuild the whole table from scratch.
    """
    device_id = db.Column(
        db.BigInteger,
        db.ForeignKey(Device.id, ondelete='CASCADE'),
        primary_key=True,
    )
    device = db.relationship(Device, primaryjoin=Device.id == device_id)

    properties = db.Column(TSVECTOR, nullable=False)
    tags = db.Column(TSVECTOR)

    __table_args__ = (
        # todo: creating these indexes concurrently requires committing
        #   them separately; see
        #   https://docs.sqlalchemy.org/en/latest/dialects/postgresql.html#indexes-with-concurrently
        db.Index('properties gist', properties, postgresql_using='gist'),
        db.Index('tags gist', tags, postgresql_using='gist'),
        {
            # UNLOGGED is meant for temporary data only: the table can
            # come back empty when the server is turned on again.
            'prefixes': ['UNLOGGED']
        }
    )

    @classmethod
    def update_modified_devices(cls, session: db.Session):
        """Refresh the search documents of devices touched in ``session``.

        Looks at the new and dirty objects of the session and collects
        the devices referenced by actions (their device(s), parent and
        components) and by tags that have a device. After flushing the
        session, the tokens of every collected non-component device
        are regenerated.

        This method is registered as a SQLAlchemy listener in the
        Devicehub class.
        """
        touched = set()
        for obj in chain(session.new, session.dirty):
            if isinstance(obj, Action):
                if isinstance(obj, ActionWithMultipleDevices):
                    touched |= obj.devices
                elif isinstance(obj, ActionWithOneDevice):
                    touched.add(obj.device)
                if obj.parent:
                    touched.add(obj.parent)
                touched |= obj.components
                continue
            if isinstance(obj, Tag) and obj.device:
                touched.add(obj.device)

        # This flush is controversial, see
        # https://groups.google.com/forum/#!topic/sqlalchemy/hBzfypgPfYo
        # todo: probably replace it with the solution proposed there
        session.flush()
        for candidate in touched:
            # Components get no document of their own.
            if isinstance(candidate, Component):
                continue
            cls.set_device_tokens(session, candidate)

    @classmethod
    def set_all_devices_tokens_if_empty(cls, session: db.Session):
        """Rebuild the search table when it contains no rows.

        The table can be found empty after Postgres shuts down
        unexpectedly, because unlogged tables such as this one are
        emptied in that case.
        """
        if DeviceSearch.query.first():
            return
        cls.regenerate_search_table(session)

    @classmethod
    def regenerate_search_table(cls, session: db.Session):
        """Delete every search row and recompute it for each device.

        Components are skipped; only the remaining devices get a
        document.
        """
        DeviceSearch.query.delete()
        for device in Device.query:
            if isinstance(device, Component):
                continue
            cls.set_device_tokens(session, device)

    @classmethod
    def set_device_tokens(cls, session: db.Session, device: Device):
        """(Re)generate and upsert the search document of ``device``.

        Builds a list of ``(value, weight)`` tokens from the device
        (id, humanized type, model, manufacturer, serial number, brand
        aliases and, for computers, aggregated component fields), hands
        them to ``search.Search.vectorize`` and stores the outcome in
        ``properties``. A second query does the same for the device's
        tags and their organizations, stored in ``tags``. The row is
        inserted, or updated when one already exists for the device.

        :param session: session used to run the queries and the upsert.
        :param device: the device to index; must not be a component.
        """
        assert not isinstance(device, Component)

        weight = search.Weight
        is_computer = isinstance(device, Computer)

        tokens = [
            (str(device.id), weight.A),
            (inflection.humanize(device.type), weight.B),
            (Device.model, weight.B),
            (Device.manufacturer, weight.C),
            (Device.serial_number, weight.A)
        ]

        maker = device.manufacturer
        if maker:
            # todo this has to be done using a dictionary
            maker = maker.lower()
            if 'asus' in maker:
                tokens.append(('asus', weight.B))
            if any(alias in maker for alias in ('hewlett', 'hp', 'h.p')):
                tokens.extend([
                    ('hp', weight.B),
                    ('h.p', weight.C),
                    ('hewlett', weight.C),
                    ('packard', weight.C),
                ])

        if is_computer:
            # Aggregate the values of all the components of the pc
            Comp = aliased(Component)
            tokens.extend([
                (db.func.string_agg(db.cast(Comp.id, db.TEXT), ' '), weight.D),
                (db.func.string_agg(Comp.model, ' '), weight.C),
                (db.func.string_agg(Comp.manufacturer, ' '), weight.D),
                (db.func.string_agg(Comp.serial_number, ' '), weight.B),
                (db.func.string_agg(Comp.type, ' '), weight.B),
                ('Computer', weight.C),
                ('PC', weight.C),
            ])

        properties_vector = search.Search.vectorize(*tokens)
        properties_query = session.query(properties_vector)
        properties_query = properties_query.filter(Device.id == device.id)
        if is_computer:
            # Join to the components so the string_agg tokens have rows
            properties_query = properties_query.outerjoin(Comp, Computer.components)
            properties_query = properties_query.group_by(Device.id)

        tags_vector = search.Search.vectorize(
            (db.func.string_agg(Tag.id, ' '), weight.A),
            (db.func.string_agg(Tag.secondary, ' '), weight.A),
            (db.func.string_agg(Organization.name, ' '), weight.B)
        )
        tags_query = session.query(tags_vector)
        tags_query = tags_query.filter(Tag.device_id == device.id)
        tags_query = tags_query.join(Tag.org)

        # Note that commit flushes later
        # todo see how to get rid of the one_or_none() by embedding those as subqueries
        # Not pretty, but it keeps the 'on conflict do update' behaviour
        document = {
            'properties': properties_query.one_or_none(),
            'tags': tags_query.one_or_none(),
        }
        upsert = postgresql.insert(DeviceSearch.__table__)
        upsert = upsert.values(device_id=device.id, **document)
        upsert = upsert.on_conflict_do_update(
            constraint='device_search_pkey',
            set_=document,
        )
        session.execute(upsert)
Fix tags not added to device search 2018-10-18 08:09:10 +00:00			`from itertools import chain`

Add DeviceSearch 2018-09-29 10:24:22 +00:00			`import inflection`
			`from sqlalchemy.dialects import postgresql`
			`from sqlalchemy.dialects.postgresql import TSVECTOR`
			`from sqlalchemy.orm import aliased`

			`from ereuse_devicehub.db import db`
			`from ereuse_devicehub.resources import search`
Change event to action complying with schema.org 2019-05-11 14:27:22 +00:00			`from ereuse_devicehub.resources.action.models import Action, ActionWithMultipleDevices, \`
			`ActionWithOneDevice`
Add DeviceSearch 2018-09-29 10:24:22 +00:00			`from ereuse_devicehub.resources.agent.models import Organization`
			`from ereuse_devicehub.resources.device.models import Component, Computer, Device`
			`from ereuse_devicehub.resources.tag.model import Tag`


			`class DeviceSearch(db.Model):`
			`"""Temporary table that stores full-text device documents.`

			`It provides methods to auto-run`
			`"""`
			`device_id = db.Column(db.BigInteger,`
			`db.ForeignKey(Device.id, ondelete='CASCADE'),`
			`primary_key=True)`
			`device = db.relationship(Device, primaryjoin=Device.id == device_id)`

Use Postgres 11; enhance search query with websearch; really use indexes, add hash index; fix partially patching lots 2019-02-07 12:47:42 +00:00			`properties = db.Column(TSVECTOR, nullable=False)`
			`tags = db.Column(TSVECTOR)`

			`__table_args__ = (`
			`# todo to add concurrency this should be commited separately`
			`# see https://docs.sqlalchemy.org/en/latest/dialects/postgresql.html#indexes-with-concurrently`
			`db.Index('properties gist', properties, postgresql_using='gist'),`
			`db.Index('tags gist', tags, postgresql_using='gist'),`
			`{`
			`'prefixes': ['UNLOGGED']`
			`# Only for temporal tables, can cause table to empty on turn on`
			`}`

			`)`
Add DeviceSearch 2018-09-29 10:24:22 +00:00
			`@classmethod`
			`def update_modified_devices(cls, session: db.Session):`
Fix tags not added to device search 2018-10-18 08:09:10 +00:00			`"""Updates the documents of the devices that are part of a`
Change event to action complying with schema.org 2019-05-11 14:27:22 +00:00			`modified action, or tag in the passed-in session.`
Add DeviceSearch 2018-09-29 10:24:22 +00:00
Fix tags not added to device search 2018-10-18 08:09:10 +00:00			`This method is registered as a SQLAlchemy listener in the`
			`Devicehub class.`
Add DeviceSearch 2018-09-29 10:24:22 +00:00			`"""`
			`devices_to_update = set()`
Fix tags not added to device search 2018-10-18 08:09:10 +00:00			`for model in chain(session.new, session.dirty):`
Change event to action complying with schema.org 2019-05-11 14:27:22 +00:00			`if isinstance(model, Action):`
			`if isinstance(model, ActionWithMultipleDevices):`
Fix tags not added to device search 2018-10-18 08:09:10 +00:00			`devices_to_update \|= model.devices`
Change event to action complying with schema.org 2019-05-11 14:27:22 +00:00			`elif isinstance(model, ActionWithOneDevice):`
Fix tags not added to device search 2018-10-18 08:09:10 +00:00			`devices_to_update.add(model.device)`
			`if model.parent:`
			`devices_to_update.add(model.parent)`
			`devices_to_update \|= model.components`
			`elif isinstance(model, Tag) and model.device:`
			`devices_to_update.add(model.device)`
Add DeviceSearch 2018-09-29 10:24:22 +00:00
			`# this flush is controversial:`
			`# see https://groups.google.com/forum/#!topic/sqlalchemy/hBzfypgPfYo`
			`# todo probably should replace it with what the solution says`
			`session.flush()`
			`for device in (d for d in devices_to_update if not isinstance(d, Component)):`
			`cls.set_device_tokens(session, device)`

			`@classmethod`
			`def set_all_devices_tokens_if_empty(cls, session: db.Session):`
			`"""Generates the search docs if the table is empty.`

			`This can happen if Postgres' shut down unexpectedly, as`
			`it deletes unlogged tables as ours.`
			`"""`
			`if not DeviceSearch.query.first():`
Add regenerate-search 2018-10-31 11:27:16 +00:00			`cls.regenerate_search_table(session)`

			`@classmethod`
			`def regenerate_search_table(cls, session: db.Session):`
			`"""Deletes and re-computes all the search table."""`
			`DeviceSearch.query.delete()`
			`for device in Device.query:`
			`if not isinstance(device, Component):`
			`cls.set_device_tokens(session, device)`
Add DeviceSearch 2018-09-29 10:24:22 +00:00
			`@classmethod`
			`def set_device_tokens(cls, session: db.Session, device: Device):`
			`"""(Re)Generates the device search tokens."""`
			`assert not isinstance(device, Component)`

			`tokens = [`
Fix searching for IDs, numbers, HP, and Asus 2018-10-31 10:40:45 +00:00			`(str(device.id), search.Weight.A),`
Add DeviceSearch 2018-09-29 10:24:22 +00:00			`(inflection.humanize(device.type), search.Weight.B),`
			`(Device.model, search.Weight.B),`
			`(Device.manufacturer, search.Weight.C),`
			`(Device.serial_number, search.Weight.A)`
			`]`
Fix searching for IDs, numbers, HP, and Asus 2018-10-31 10:40:45 +00:00
			`if device.manufacturer:`
			`# todo this has to be done using a dictionary`
			`manufacturer = device.manufacturer.lower()`
			`if 'asus' in manufacturer:`
			`tokens.append(('asus', search.Weight.B))`
			`if 'hewlett' in manufacturer or 'hp' in manufacturer or 'h.p' in manufacturer:`
			`tokens.append(('hp', search.Weight.B))`
			`tokens.append(('h.p', search.Weight.C))`
			`tokens.append(('hewlett', search.Weight.C))`
			`tokens.append(('packard', search.Weight.C))`

Add DeviceSearch 2018-09-29 10:24:22 +00:00			`if isinstance(device, Computer):`
Fix searching for IDs, numbers, HP, and Asus 2018-10-31 10:40:45 +00:00			`# Aggregate the values of all the components of pc`
Add DeviceSearch 2018-09-29 10:24:22 +00:00			`Comp = aliased(Component)`
simplify token in search 2020-12-17 10:35:55 +00:00			`tokens.extend((`
			`(db.func.string_agg(db.cast(Comp.id, db.TEXT), ' '), search.Weight.D),`
			`(db.func.string_agg(Comp.model, ' '), search.Weight.C),`
			`(db.func.string_agg(Comp.manufacturer, ' '), search.Weight.D),`
			`(db.func.string_agg(Comp.serial_number, ' '), search.Weight.B),`
			`(db.func.string_agg(Comp.type, ' '), search.Weight.B),`
			`('Computer', search.Weight.C),`
			`('PC', search.Weight.C),`
			`))`
Add DeviceSearch 2018-09-29 10:24:22 +00:00
			`properties = session \`
			`.query(search.Search.vectorize(*tokens)) \`
			`.filter(Device.id == device.id)`

			`if isinstance(device, Computer):`
			`# Join to components`
			`properties = properties \`
			`.outerjoin(Comp, Computer.components) \`
			`.group_by(Device.id)`

			`tags = session.query(`
			`search.Search.vectorize(`
			`(db.func.string_agg(Tag.id, ' '), search.Weight.A),`
Fix not adding tag.secondary to search 2018-10-15 11:12:13 +00:00			`(db.func.string_agg(Tag.secondary, ' '), search.Weight.A),`
Add DeviceSearch 2018-09-29 10:24:22 +00:00			`(db.func.string_agg(Organization.name, ' '), search.Weight.B)`
			`)`
			`).filter(Tag.device_id == device.id).join(Tag.org)`

			`# Note that commit flushes later`
			`# todo see how to get rid of the one_or_none() by embedding those as subqueries`
			`# I don't like this but I want the 'on_conflict_on_update' thingie`
			`device_document = dict(properties=properties.one_or_none(), tags=tags.one_or_none())`
			`insert = postgresql.insert(DeviceSearch.__table__) \`
			`.values(device_id=device.id, **device_document) \`
			`.on_conflict_do_update(constraint='device_search_pkey', set_=device_document)`
			`session.execute(insert)`