Overview

Download it here and install it using pip:

$ pip install squirro.lib.nlp-SQUIRRO_VERSION-py2.py3-none-any.whl

libNLP is structured as a pipeline in which the user specifies a sequence of steps that load and transform unstructured data, which can then be classified (or otherwise processed) and ultimately saved either to Squirro or to disk (in CSV or JSON format).

The pipeline configuration is specified in JSON format. For example:

    {
    "dataset": {
        "items": [{
          "id": "0",
          "label": ["fake"],
          "body": "<html><body><p>This is a fake Squirro Item. It is composed of a couple fake sentences.</p></body></html>"
        },{
          "id": "1",
          "label": ["not fake"],
          "body": "<html><body><p>This is not a fake Squirro Item. It is composed of a couple not fake sentences.</p></body></html>"
        },{
          "id": "2",
          "label": ["fake"],
          "body": "<html><body><p>This is a fake Squirro Item. It is composed of a couple fake sentences.</p></body></html>"
        },{
          "id": "3",
          "label": ["not fake"],
          "body": "<html><body><p>This is not a fake Squirro Item. It is composed of a couple not fake sentences.</p></body></html>"
        },{
          "id": "4",
          "label": ["fake"],
          "body": "<html><body><p>This is a fake Squirro Item. It is composed of a couple fake sentences.</p></body></html>"
        }]
    },
    "pipeline": [
        {
          "fields": [
            "body",
            "label"
          ],
          "step": "loader",
          "type": "squirro_item"
        },
        {
          "fields": [
            "body"
          ],
          "step": "filter",
          "type": "empty"
        },
        {
          "input_fields": [
            "extract_sentences"
          ],
          "output_fields": [
            "normalized_extract"
          ],
          "step": "normalizer",
          "type": "html"
        },
        {
          "fields": [
            "normalized_extract"
          ],
          "step": "normalizer",
          "type": "punctuation"
        },
        {
          "fields": [
            "normalized_extract"
          ],
          "mark_as_skipped": true,
          "step": "filter",
          "type": "regex",
          "whitelist_regexes": [
            "^.{20,}$"
          ]
        },
        {
          "step": "embedder",
          "type": "transformers",
          "transformer": "huggingface",
          "model_name": "https://tfhub.dev/google/universal-sentence-encoder/4",
          "input_field": "body",
          "output_field": "embedded_extract"
        },
        {
          "step": "randomizer",
          "type": "randomizer"
        },
        {
          "input_field": "embedded_extract",
          "label_field": "label",
          "output_field": "prediction",
          "step": "classifier",
          "type": "cosine_similarity"
        },
        {
          "step": "debugger",
          "type": "log_fields",
          "fields": [
            "extract_sentences",
            "prediction"
          ],
          "log_level": "warning"
        }
    ]
    }