Most frequently used words in a text with Python


The challenge

Write a function that, given a string of text (possibly with punctuation and line-breaks), returns an array of the top-3 most occurring words, in descending order of the number of occurrences.

Assumptions:

  • A word is a string of letters (A to Z) optionally containing one or more apostrophes (') in ASCII. (No need to handle fancy punctuation.)
  • Matches should be case-insensitive, and the words in the result should be lowercased.
  • Ties may be broken arbitrarily.
  • If a text contains fewer than three unique words, then either the top-2 or top-1 words should be returned, or an empty array if a text contains no words.

Examples:

top_3_words("In a village of La Mancha, the name of which I have no desire to call to
mind, there lived not long since one of those gentlemen that keep a lance
in the lance-rack, an old buckler, a lean hack, and a greyhound for
coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra
on Sundays, made away with three-quarters of his income.")
# => ["a", "of", "on"]

top_3_words("e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e")
# => ["e", "ddd", "aa"]

top_3_words("  //wont won't won't")
# => ["won't", "wont"]

Bonus points:

  1. Avoid creating an array whose memory footprint is roughly as big as the input text.
  2. Avoid sorting the entire array of unique words.

Test cases

import re
from collections import Counter
from random import choice, choices, randint, sample, shuffle


def verify(s, this=None):
    """Check a ``top_3_words`` result for the input text *s*.

    Args:
        s: The text handed to the solution.
        this: Optional precomputed result to check instead of calling
            ``top_3_words(s)`` (only for debugging purposes).

    The reference tally counts lowercased runs of ASCII letters and
    apostrophes, ignoring runs made only of apostrophes. Because ties may be
    broken arbitrarily, the answer is judged by the *frequencies* of the
    returned words rather than by the words themselves. The outcome is
    reported through ``Test.expect``.
    """
    returned_result = top_3_words(s) if this is None else this

    # reference frequencies; `w.strip("'")` is empty for apostrophe-only tokens
    fs = Counter(w for w in re.findall(r"[a-zA-Z']+", s.lower()) if w.strip("'"))
    top = fs.most_common(3)
    exp = [word for word, _ in top]
    expected_frequencies = [freq for _, freq in top]

    msg = ''
    # Counter returns 0 for unknown keys, so `not fs[w]` flags invented words
    wrong_words = [w for w in returned_result if not fs[w]]
    actual_freq = [fs[w] for w in returned_result]

    if wrong_words:
        msg = 'Incorrect match: words not present in the string. Your output: {}. One possible valid answer: {}'.format(returned_result, exp)
    elif len(set(returned_result)) != len(returned_result):
        msg = "The result should not contain copies of the same word. Your output: {}. One possible output: {}".format(returned_result, exp)
    elif actual_freq != expected_frequencies:
        msg = "Incorrect frequencies: {} should be {}. Your output: {}. One possible output: {}".format(actual_freq, expected_frequencies, returned_result, exp)

    # an empty message means every check passed
    Test.expect(not msg, msg)



@test.describe("Fixed tests")
def fixed_tests():
    """Run ``verify`` over a fixed set of hand-picked inputs."""

    TESTS = (
        "a a a  b  c c  d d d d  e e e e e",
        "e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e",
        # apostrophes are part of a word: "won't" and "wont" are distinct
        "  //wont won't won't ",
        "  , e   .. ",
        # inputs without any word: punctuation only / apostrophes only
        "  ...  ",
        "  '  ",
        "  '''  ",
        """In a village of La Mancha, the identify of which I've no want to cao
    thoughts, there lived not lengthy since a type of gents that maintain a lance
    within the lance-rack, an outdated buckler, a lean hack, and a greyhound for
    coursing. An olla of quite extra beef than mutton, a salad on most
    nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so additional
    on Sundays, made away with three-quarters of his revenue.""",
        "a a a  b  c c X",
        "a a c b b",
    )
    for s in TESTS:
        verify(s)
    
@test.describe("Random tests")
def random_tests():
    """Run ``verify`` over randomly generated texts."""

    def gen_word():
        # 3 to 10 random characters drawn from ASCII letters and the apostrophe
        return "".join(choice("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'") for _ in range(randint(3, 10)))

    def gen_separator():
        # 1 to 5 random punctuation / space characters placed after a word
        return "".join(choice("-,.?!_:;/ ") for _ in range(randint(1, 5)))

    def gen_string():
        words = []
        # 20 repetition counts in 1..30; at most 20 are consumed below
        nums = choices(range(1, 31), k=20)
        for _ in range(randint(0, 20)):
            words += [gen_word()] * nums.pop()
        shuffle(words)
        # every word is followed by its own random separator run
        return "".join(word + gen_separator() for word in words)

    @test.it("Tests")
    def it_1():
        for _ in range(100):
            verify(gen_string())
            

The solution using Python

Option 1:

# use the Counter module
from collections import Counter
# use the regex module
import re

def top_3_words(text):
    """Return up to three most frequent words of *text*, most frequent first.

    A word is a run of ASCII letters and apostrophes containing at least one
    letter. Matching is case-insensitive and the returned words are
    lowercased. Fewer than three words are returned when the text has fewer
    than three distinct words (an empty list when it has none).
    """
    # lazily walk the matches instead of materialising the full token list
    tokens = (m.group() for m in re.finditer(r"[a-z']+", text.lower()))
    # `strip("'")` leaves an empty (falsy) string for apostrophe-only tokens,
    # wherever they occur - not only when they are surrounded by spaces
    c = Counter(token for token in tokens if token.strip("'"))
    # return the `most common` 3 items
    return [w for w, _ in c.most_common(3)]

Option 2:

def top_3_words(text):
    """Return up to three most frequent words of *text*, most frequent first.

    Every character that is neither a letter (per ``str.isalpha``) nor an
    apostrophe is treated as a separator. Tokens without any letter are
    ignored. Matching is case-insensitive and the returned words are
    lowercased. Among equally frequent words, the one seen first wins.
    """
    # turn every separator character into a space in a single pass
    cleaned = "".join(c if c.isalpha() or c == "'" else " " for c in text)

    # tally the words; a dict gives constant-time lookups per word
    counts = {}
    for word in cleaned.lower().split():
        # skip tokens made only of apostrophes
        if not any(c.isalpha() for c in word):
            continue
        counts[word] = counts.get(word, 0) + 1

    # pick the current maximum at most three times instead of sorting everything
    out = []
    while counts and len(out) < 3:
        best = max(counts, key=counts.get)
        out.append(best)
        del counts[best]
    return out

Option 3:

def top_3_words(text):
    """Return up to three most frequent words of *text*, most frequent first.

    ASCII punctuation (except the apostrophe) and ASCII digits are treated as
    separators, together with whitespace. Tokens made only of apostrophes are
    ignored. Matching is case-insensitive and the returned words are
    lowercased.
    """
    import heapq

    # all ASCII punctuation but the apostrophe, plus the digits
    separators = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~0123456789'
    # one translation pass instead of one `replace` call per separator
    table = str.maketrans(separators, ' ' * len(separators))

    wrds = {}
    for w in text.translate(table).lower().split():
        # `strip("'")` is empty for apostrophe-only tokens
        if w.strip("'"):
            wrds[w] = wrds.get(w, 0) + 1

    # take the three largest counts without sorting every distinct word
    top = heapq.nlargest(3, wrds.items(), key=lambda item: item[1])
    return [word for word, _ in top]

Related Articles

LEAVE A REPLY

Please enter your comment!
Please enter your name here

Latest Articles