NLP Tweet Split
Tweet Split
Given a set of Twitter hashtags, split each hashtag into its constituent words. For example:
"wearethepeople" is split into "we are the people"; "mentionyourfaves" is split into "mention your faves".
import re
def segment(tweet, cword):
    """Split *tweet* into words drawn from the list *cword*.

    Returns the list of words in reading order on success, or False when
    the tweet cannot be fully segmented.
    """
    # Try longer words first so the greedy search prefers the longest
    # prefix. Use sorted() rather than list.sort() so the caller's word
    # list is not mutated as a side effect.
    return tokenize(tweet, sorted(cword, key=len, reverse=True), "")
def tokenize(tweet, cword, token):
    """Recursively segment *tweet* using prefix words from *cword*.

    *token* is the word already matched by the caller; it is emitted
    before the words found in the remainder, so the result comes back in
    reading order (the original appended it after, producing a reversed
    list with a stray empty string from the initial call).

    Returns a list of words, or False if no segmentation exists.
    """
    # Base case: nothing left to match; emit the pending token, if any.
    if not tweet:
        return [token] if token else []
    # Try every candidate prefix. If the recursive call fails, keep
    # trying the remaining candidates (backtracking) instead of giving
    # up on the first match as the original did.
    for pref in cword:
        if tweet.startswith(pref):
            rest = tokenize(tweet[len(pref):], cword, pref)
            if rest is not False:
                return [token] + rest if token else rest
    # No prefix leads to a complete segmentation.
    return False
def main():
    """Read a count and that many hashtags from stdin, then print each
    hashtag segmented into words (or unchanged when it cannot be split)."""
    count = int(input())
    hashtags = [input().strip() for _ in range(count)]
    # Sample word list (you can replace it with a larger, more comprehensive list)
    word_list = ["we", "are", "the", "people", "mention", "your", "faves", "now", "playing", "the", "walking", "dead", "follow", "me"]
    for hashtag in hashtags:
        words = segment(hashtag, word_list)
        print(' '.join(words) if words else hashtag)


if __name__ == "__main__":
    main()
This post is licensed under CC BY 4.0 by the author.