0

This is my program: when it is sent an input string of text, it responds with each word and how many times it occurs (returned as JSON objects):

from swagger_server.models.result import Result  # noqa: E501


def get_concordance(body):  # noqa: E501
    """Calculate

    Post text to generate concordance # noqa: E501

    The body is decoded as UTF-8 and split on whitespace. The response
    holds a single concordance entry whose "count" is the complete
    token -> occurrences mapping and whose "token" is the complete token
    list (duplicates included), plus the decoded text under "input".

    :param body: Text to be analyzed
    :type body: dict | bytes

    :rtype: Result
    """
    input_text = body.decode('utf-8')
    tokens = input_text.split()

    try:
        # Tally occurrences; keys keep first-seen order and are case-sensitive.
        tally = {}
        for token in tokens:
            tally[token] = tally.get(token, 0) + 1

        response = {
            "concordance": [
                {
                    "count": tally,
                    "token": tokens
                }
            ],
            "input": input_text
        }
    except Exception as error:
        # Any failure while building the payload is reported to the caller
        # instead of being raised.
        response = {
            "error": repr(error)
        }
    return response

Right now the JSON output looks like:

{
  "concordance": [
    {
      "count": {
        "The": 1,
        "brown": 2,
        "fox": 1,
        "jumped": 1,
        "log.": 1,
        "over": 1,
        "the": 1
      },
      "token": [
        "The",
        "brown",
        "fox",
        "jumped",
        "over",
        "the",
        "brown",
        "log."
      ]
    }
  ],
  "input": "The brown fox jumped over the brown log."
}

However, I am trying to have the output formatted to look like:

{
  "concordance": [
    {
      "token": "brown",
      "count": 2
    },
    {
      "token": "fox",
      "count": 1
    },
    {
      "token": "jumped",
      "count": 1
    },
    {
      "token": "log",
      "count": 1
    },
    {
      "token": "over",
      "count": 1
    },
    {
      "token": "the",
      "count": 1
    }
  ],
  "input": "The brown fox jumped over the brown log."
}

Does anyone know how I can change my code so that it prints it correctly? I am not sure how I can separate each word in the list and associate it with its count. Thank you.

3
  • I posted the actual output first, and the desired output second. Commented Sep 23, 2020 at 20:17
  • Can you post the sample input? Commented Sep 23, 2020 at 20:18
  • sample input is The brown fox jumped over the brown log. Commented Sep 23, 2020 at 20:19

2 Answers 2

1

Here's a relatively simple way to do it that uses the collections.Counter class to simplify things a little bit (and gets rid of the nested function):

import collections
from pprint import pprint
#from swagger_server.models.result import Result  # noqa: E501


def get_concordance(body):  # noqa: E501
    """Calculate

    Post text to generate concordance # noqa: E501

    The body is decoded as UTF-8 and split on whitespace. Each distinct
    token is listed exactly once, in order of first appearance, together
    with the number of times it occurs. Tokens are compared
    case-sensitively and keep any attached punctuation.

    :param body: Text to be analyzed; must support ``.decode('utf-8')``
    :type body: bytes

    :return: ``{"concordance": [{"token": str, "count": int}, ...],
        "input": str}``
    :rtype: dict
    """

    input_text = body.decode('utf-8')
    word_counts = collections.Counter(input_text.split())

    # Iterate the counter's own (token, count) pairs. Zipping the raw token
    # list against the counts misaligns them as soon as a word repeats,
    # because the token list is longer than the set of distinct words.
    pairs = [{'token': token, 'count': count}
             for token, count in word_counts.items()]
    response = {
        "concordance": pairs,
        "input": input_text
    }
    return response



if __name__ == '__main__':
    # Demo run: pretty-print the concordance of a short sample sentence,
    # keeping the dictionaries' own key order instead of sorting them.
    sample = b'The brown fox jumped over the log'
    pprint(get_concordance(sample), sort_dicts=False)

Output:

{'concordance': [{'token': 'The', 'count': 1},
                 {'token': 'brown', 'count': 1},
                 {'token': 'fox', 'count': 1},
                 {'token': 'jumped', 'count': 1},
                 {'token': 'over', 'count': 1},
                 {'token': 'the', 'count': 1},
                 {'token': 'log', 'count': 1}],
 'input': 'The brown fox jumped over the log'}
Sign up to request clarification or add additional context in comments.

Comments

0

Construct a list of your token-count dictionaries in a list comprehension.

def get_concordance(body):  # noqa: E501
    """Return one {"count", "token"} entry per distinct word in *body*.

    The body is decoded as UTF-8 and split on whitespace. Distinct tokens
    are reported in order of first appearance and are case-sensitive. The
    decoded text is echoed back under "input". If building the response
    raises, ``{"error": repr(exc)}`` is returned instead.
    """
    input_text = body.decode('utf-8')
    tokens = input_text.split()

    try:
        # e.g. {"the": 2, "brown": 1}
        tally = {}
        for token in tokens:
            tally[token] = tally.get(token, 0) + 1

        # One entry per distinct token, paired with its own count.
        entries = [
            {"count": occurrences, "token": token}
            for token, occurrences in tally.items()
        ]
        response = {
            "concordance": entries,
            "input": input_text,
        }
    except Exception as error:
        response = {
            "error": repr(error)
        }
    return response
# Example call with a repeated word ("brown"). The bare expression on the
# second line only displays its value in an interactive session (REPL or
# notebook); in a script, wrap it in print() to see the result.
inputs = b"The brown fox jumped over the brown log"
get_concordance(inputs)

{'concordance': [{'count': 1, 'token': 'The'},
  {'count': 2, 'token': 'brown'},
  {'count': 1, 'token': 'fox'},
  {'count': 1, 'token': 'jumped'},
  {'count': 1, 'token': 'over'},
  {'count': 1, 'token': 'the'},
  {'count': 1, 'token': 'log'}],
 'input': 'The brown fox jumped over the brown log'}

4 Comments

The output format is correct, however, it is returning repeat words (i.e. "brown"). I'd like to have the word "brown" listed once, with its current count.
Current output (with duplicate "brown"): { "concordance": [ { "count": 1, "token": "The" }, { "count": 2, "token": "brown" }, { "count": 1, "token": "fox" }, { "count": 1, "token": "jumped" }, { "count": 1, "token": "over" }, { "count": 1, "token": "the" }, { "count": 2, "token": "brown" }, { "count": 1, "token": "log" } ], "input": "The brown fox jumped over the brown log" }
@HannahYoussef - fixed it by iterating over the counts dictionary instead of the split string.
Works perfectly. Thank you!

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.