The problem
Write a function that, given a string of text (possibly with punctuation and line-breaks), returns an array of the top-3 most occurring words, in descending order of the number of occurrences.
Assumptions:
- A word is a string of letters (A to Z) optionally containing one or more apostrophes (') in ASCII. (No need to handle fancy punctuation.)
- Matches should be case-insensitive, and the words in the result should be lowercased.
- Ties may be broken arbitrarily.
- If a text contains fewer than three unique words, then either the top-2 or top-1 words should be returned, or an empty array if a text contains no words.
Examples:
top_3_words("In a village of La Mancha, the name of which I have no desire to call to
mind, there lived not long since one of those gentlemen that keep a lance
in the lance-rack, an old buckler, a lean hack, and a greyhound for
coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra
on Sundays, made away with three-quarters of his income.")
# => ["a", "of", "on"]
top_3_words("e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e")
# => ["e", "ddd", "aa"]
top_3_words(" //wont won't won't")
# => ["won't", "wont"]
Bonus points:
- Avoid creating an array whose memory footprint is roughly as big as the input text.
- Avoid sorting the entire array of unique words.
Test cases
import re
from collections import Counter
from random import choice, choices, randint, sample, shuffle
def verify(s, this=None):  # this: only for debugging purposes
    """Check the output of ``top_3_words`` for the input string ``s``.

    The reference frequencies are computed here with a regex and a
    ``Counter``, so any valid tie-breaking order is accepted: only the
    *frequencies* of the returned words are compared with the expected
    top-3 frequencies.

    Args:
        s: The text handed to ``top_3_words``.
        this: Optional pre-computed result to validate instead of calling
            ``top_3_words(s)``.

    The outcome is reported through ``Test.expect`` (presumably the Codewars
    test framework; it is not defined in this file).
    """
    returned_result = top_3_words(s) if this is None else this

    # Reference word frequencies; tokens made only of apostrophes are not words.
    fs = Counter(w for w in re.findall(r"[a-zA-Z']+", s.lower()) if w.strip("'"))
    # zip(*...) cannot be unpacked into two names when there are no words.
    exp, expected_frequencies = map(list, zip(*fs.most_common(3))) if fs else ([], [])

    msg = ''
    # Counter lookups of missing keys return 0 without inserting the key.
    wrong_words = [w for w in returned_result if not fs[w]]
    actual_freq = [fs[w] for w in returned_result]

    if wrong_words:
        msg = 'Incorrect match: words not present in the string. Your output: {}. One possible valid answer: {}'.format(returned_result, exp)
    elif len(set(returned_result)) != len(returned_result):
        msg = 'The result should not contain copies of the same word. Your output: {}. One possible output: {}'.format(returned_result, exp)
    elif actual_freq != expected_frequencies:
        msg = "Incorrect frequencies: {} should be {}. Your output: {}. One possible output: {}".format(actual_freq, expected_frequencies, returned_result, exp)

    Test.expect(not msg, msg)
@test.describe("Fixed tests")
def fixed_tests():
    """Run ``verify`` on a fixed set of hand-picked inputs.

    The inputs cover mixed case, punctuation between words, apostrophes
    inside words, apostrophe-only tokens, inputs without any word, and ties.
    ``test`` is presumably the Codewars test framework object; it is not
    defined in this file.
    """
    TESTS = (
        "a a a b c c d d d d e e e e e",
        "e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e",
        # "won't" and "wont" must be counted as two different words.
        " //wont won't won't ",
        " , e .. ",
        " ... ",
        # Tokens consisting solely of apostrophes are not words.
        " ' ",
        " ''' ",
        """In a village of La Mancha, the name of which I have no desire to call to
mind, there lived not long since one of those gentlemen that keep a lance
in the lance-rack, an old buckler, a lean hack, and a greyhound for
coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra
on Sundays, made away with three-quarters of his income.""",
        "a a a b c c X",
        "a a c b b",
    )
    for s in TESTS:
        verify(s)
@test.describe("Random tests")
def random_tests():
    """Run ``verify`` on 100 randomly generated strings.

    ``test`` is presumably the Codewars test framework object; it is not
    defined in this file.
    """

    def gen_word():
        """Return a random token of 3 to 10 ASCII letters and apostrophes."""
        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'"
        return "".join(choice(alphabet) for _ in range(randint(3, 10)))

    def gen_string():
        """Return shuffled, repeated random tokens joined by random separators."""
        words = []
        # 20 repetition counts are drawn; at most 20 are consumed below.
        nums = choices(range(1, 31), k=20)
        for _ in range(randint(0, 20)):
            words += [gen_word()] * nums.pop()
        shuffle(words)
        # Every token is followed by 1 to 5 separator characters.
        return "".join(
            word + "".join(choice("-,.?!_:;/ ") for _ in range(randint(1, 5)))
            for word in words
        )

    @test.it("Tests")
    def it_1():
        for _ in range(100):
            verify(gen_string())
The solution in Python
Option 1:
# use the Counter module
from collections import Counter
# use the regex module
import re
def top_3_words(text):
    """Return up to three most frequent words in ``text``, most frequent first.

    A word is a run of ASCII letters and apostrophes containing at least one
    letter. Matching is case-insensitive and results are lowercased.

    Args:
        text: The input string; may contain punctuation and line breaks.

    Returns:
        A list of 0 to 3 lowercase words ordered by descending frequency.
    """
    # finditer yields tokens lazily, so no list of all tokens is built.
    tokens = (m.group() for m in re.finditer(r"[a-z']+", text.lower()))
    # A token of apostrophes only (e.g. "'" or "'''") is not a word, wherever
    # it appears; stripping the apostrophes leaves an empty string for it.
    counts = Counter(tok for tok in tokens if tok.strip("'"))
    # most_common(3) selects the three largest counts.
    return [word for word, _ in counts.most_common(3)]
Option 2:
def top_3_words(text):
    """Return up to three most frequent words in ``text``, most frequent first.

    This variant uses no regex and no ``Counter``. A word is a run of ASCII
    letters and apostrophes containing at least one letter; every other
    character acts as a separator. Matching is case-insensitive and results
    are lowercased.

    Args:
        text: The input string; may contain punctuation and line breaks.

    Returns:
        A list of 0 to 3 lowercase words ordered by descending frequency.
        Among equally frequent words, the one seen first comes first.
    """
    # One pass: turn every character that cannot be part of a word into a
    # space (instead of calling str.replace once per offending character).
    cleaned = "".join(
        ch if "a" <= ch <= "z" or ch == "'" else " " for ch in text.lower()
    )

    counts = {}
    for word in cleaned.split():
        # Skip tokens made solely of apostrophes; they are not words.
        if word.strip("'"):
            counts[word] = counts.get(word, 0) + 1

    # Pull out the current maximum up to three times; this avoids sorting
    # all unique words. max() returns the first maximal key in insertion order.
    out = []
    while counts and len(out) < 3:
        best = max(counts, key=counts.get)
        out.append(best)
        del counts[best]
    return out
Option 3:
def top_3_words(text):
    """Return up to three most frequent words in ``text``, most frequent first.

    A word is a run of ASCII letters and apostrophes containing at least one
    letter. Every other character (punctuation, digits, whitespace, ...) is
    a separator, so no hand-maintained punctuation list is needed. Matching
    is case-insensitive and results are lowercased.

    Args:
        text: The input string; may contain punctuation and line breaks.

    Returns:
        A list of 0 to 3 lowercase words ordered by descending frequency.
    """
    # Local imports keep this stand-alone variant self-contained.
    from heapq import nlargest
    from itertools import groupby

    def is_word_char(ch):
        return "a" <= ch <= "z" or ch == "'"

    wrds = {}
    # groupby splits the text into alternating runs of word characters and
    # separators, yielding one run at a time instead of a full token list.
    for is_word, run in groupby(text.lower(), key=is_word_char):
        if not is_word:
            continue
        w = "".join(run)
        # Runs made solely of apostrophes are not words.
        if w.strip("'"):
            wrds[w] = wrds.get(w, 0) + 1

    # nlargest picks the top three without sorting every unique word.
    return nlargest(3, wrds, key=wrds.get)