Tweet Split

Given a set of Twitter hashtags, split each hashtag into its constituent words. For example:

wearethepeople is split into we are the people
mentionyourfaves is split into mention your faves

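One straightforward way to solve this is recursive backtracking against a word list: peel off a dictionary word that is a prefix of the hashtag, recurse on the remainder, and back up to try a different prefix whenever the rest cannot be split. Sorting the word list by length, longest first, makes the search prefer longer words such as mention over shorter ones.
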
def segment(tweet, cword):
    # Try longer dictionary words first
    cword.sort(key=len, reverse=True)
    return tokenize(tweet, cword)

def tokenize(tweet, cword):
    # Base case: the whole hashtag has been consumed
    if not tweet:
        return []
    # Try every dictionary word that is a prefix of what remains
    for pref in cword:
        if tweet.startswith(pref):
            rest = tokenize(tweet[len(pref):], cword)
            # Keep this prefix only if the remainder can also be split
            if rest is not None:
                return [pref] + rest
    # No split is possible from this position
    return None

def main():
    num = int(input())
    tweets = [input().strip() for _ in range(num)]
    # Sample word list (you can replace it with a larger, more comprehensive list)
    word_list = ["we", "are", "the", "people", "mention", "your", "faves",
                 "now", "playing", "walking", "dead", "follow", "me"]

    for tweet in tweets:
        result = segment(tweet, word_list)
        print(' '.join(result) if result is not None else tweet)

if __name__ == "__main__":
    main()
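
With the sample word list above, running the script on this input:

2
wearethepeople
mentionyourfaves

prints:

we are the people
mention your faves

For long hashtags the plain recursion can revisit the same suffix many times, so a memoized variant is worth sketching. This is only an optional illustration, not part of the solution above; segment_memo is a hypothetical helper name, and the sketch assumes Python's functools.lru_cache to cache the result for each remaining suffix:

from functools import lru_cache

def segment_memo(tweet, cword):
    # Longest words first, duplicates removed
    words = tuple(sorted(set(cword), key=len, reverse=True))

    @lru_cache(maxsize=None)
    def helper(rest):
        # Base case: nothing left to split
        if not rest:
            return ()
        for pref in words:
            if rest.startswith(pref):
                tail = helper(rest[len(pref):])
                if tail is not None:
                    return (pref,) + tail
        # No dictionary word fits this position
        return None

    result = helper(tweet)
    return list(result) if result is not None else None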

This post is licensed under CC BY 4.0 by the author.