saiiz31’s diary

# Sample "required experience" texts for two job descriptions (JDs).
str1 = "xxグループおよび各カンパニーのミッションとバリューに共感していただける方, クレジットカード会社、貸金業またはそれに類する事業者で2年以上の与信・回収領域の業務経験がある方, 業務戦略・ロードマップの策定経験、またはそれと同様の経験, 開発ディレクションの経験, プロダクト開発への情熱"
# NOTE(review): the "¥n" sequences below are a yen sign followed by "n", not
# the newline escape "\n" — confirm whether real line breaks were intended.
str2 = "・複数のBI Toolのスキル、Python等のOSSのテクニカルスキル ¥n ・BI Toolを活用したデータ可視化環境の構築経験。¥n （特にCognos、Tableau の知見を持つ方は大歓迎） ¥n ・お客様の要望をヒアリングできるコミュニケーション力 ¥n 歓迎するスキル・経験 ¥n ・金融業界経験者、銀行業務知識保有者 ¥n ・DMP、データリクルーティング導入経験 ¥n ・営業、マーケティング、経営企画関連での実務経験 ¥n ・英語力"
import pandas as pd
# One row per JD: column "JD名" holds the JD label, "必要業務経験" the raw text.
df = pd.DataFrame([["SCI", str1], ["BI_ENG", str2]], columns=["JD名", "必要業務経験"])
### MeCab morphological analysis
!pip install mecab-python3
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
!pip install ipadic
### Reference: https://qiita.com/smiler5617/items/0744c256841875824ed2
### To read later: https://qiita.com/nakahara-d/items/b374f4d59894e726ba75
import ipadic
mecab = MeCab.Tagger(ipadic.MECAB_ARGS) # point the tagger at the installed ipadic dictionary
# Smoke test: print the parse of a sample sentence.
print(mecab.parse("私はYahooプレミアム会員になりました。"))
def fun_wakati(text):
    """Return the output of the module-level MeCab tagger's ``parse`` for ``text``."""
    return mecab.parse(text)
# Store the MeCab parse output of every JD text in a new "wakati" column.
df["wakati"] = df["必要業務経験"].apply(fun_wakati)
# Notebook cell expression: shows the raw parse output for a short sample.
mecab.parse("xxxグループおよび各カンパニーのミッションとバリューに共感していただける方, クレジット")
!pip install janome
### Reference: https://asanonaoki.com/blog/janome%E3%81%A8mecab%E3%81%A7%E6%97%A5%E6%9C%AC%E8%AA%9E%E3%81%AE%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%82%92%E5%8D%98%E8%AA%9E%E3%81%AB%E5%88%86%E3%81%91%E3%82%8B/
### Reference (janome usage details): https://nikkie-ftnext.hatenablog.com/entry/2020/01/12/224035
### Reference (janome API docs): https://mocobeta.github.io/janome/api/janome.html#janome.tokenfilter.TokenFilter
from janome.tokenizer import Tokenizer
# Tokenizer created with wakati=True; shared by fun_janome_tokenz.
t = Tokenizer(wakati=True)
def fun_janome_tokenz(text):
    """Tokenize ``text`` with the module-level janome tokenizer ``t``.

    The result of ``t.tokenize`` is materialised into a list before returning.
    """
    return [token for token in t.tokenize(text)]
# Store the janome token list of every JD text in a new "janome_tok" column.
df["janome_tok"] = df["必要業務経験"].apply(fun_janome_tokenz)
# Notebook cell expression: display the frame.
df
#### Simple decomposition and word counting
### Reference (frequent-word analysis): https://qiita.com/t_nishimaki/items/31ecd37b784224603047
# NOTE(review): no body for tokenize() is visible in this paste, so the
# definition is incomplete as written — restore it from the original notebook.
def tokenize(text):
# NOTE(review): this looks like a module-level statement that followed the
# missing function body (it passes tokenize itself as the tokenizer) — confirm.
vectorizer = TfidfVectorizer(tokenizer=tokenize)
def word_analysis(doc):
    """Collect the distinct nouns, verbs and adjectives found in ``doc``.

    The text is parsed with the module-level MeCab tagger ``mecab``. Each
    token is classified by the first field of its comma-separated feature
    string. Nouns are recorded by surface form; verbs and adjectives are
    recorded by feature field 6 (the base form in the IPA dictionary's
    feature layout), so inflected variants collapse into one entry.

    Args:
        doc: Text to analyse.

    Returns:
        A ``pd.Series`` of three lists (nouns, verbs, adjectives). Each list
        holds unique entries in order of first appearance.
    """
    # Dict keys serve as insertion-ordered sets, which keeps the output
    # deterministic across runs (plain set() ordering of strings is not).
    meishi_seen = {}
    doshi_seen = {}
    keiyoshi_seen = {}

    node = mecab.parseToNode(doc)
    while node:
        # Split once per node instead of once per branch.
        features = node.feature.split(",")
        hinshi = features[0]
        if hinshi == "名詞":
            meishi_seen[node.surface] = None
        elif hinshi == "動詞":
            doshi_seen[features[6]] = None
        elif hinshi == "形容詞":
            keiyoshi_seen[features[6]] = None
        node = node.next

    return pd.Series([list(meishi_seen), list(doshi_seen), list(keiyoshi_seen)])
# Add one column per part of speech, filled from word_analysis().
pos_columns = df["必要業務経験"].apply(word_analysis)
df[["名詞","動詞","形容詞"]] = pos_columns
df
import itertools
import collections
# Nouns too generic to be informative; dropped from the frequency table.
common_noun = ['業務', '方', '者', '経験']
# Flatten the per-row noun lists into one list and count occurrences.
words = [noun for row_nouns in df["名詞"] for noun in row_nouns]
c = collections.Counter(words)
tmp = pd.DataFrame(c.most_common()).rename(columns={0:'単語', 1:'件数'})
is_common = tmp['単語'].isin(common_noun)
tmp = tmp[~is_common]
tmp
# Build a per-JD noun frequency table: one row per (category, word) pair.
count_columns = ['カテゴリ', '単語', '件数']
# Set membership replaces list.remove(), which dropped only the first
# occurrence and raised ValueError when a common noun was absent.
excluded_nouns = set(common_noun)
category_frames = []
for category in df['JD名'].unique().tolist():
    # Restrict to this category's rows; without the filter every category
    # received the same counts computed over the whole frame.
    category_nouns = df.loc[df['JD名'] == category, "名詞"]
    words = [
        word
        for word in itertools.chain.from_iterable(category_nouns)
        if word not in excluded_nouns
    ]

    c = collections.Counter(words)

    # Naming the columns up front keeps this working when a category has no
    # remaining words (an empty frame would otherwise lack these columns).
    tmp = pd.DataFrame(c.most_common(), columns=['単語', '件数'])
    tmp['カテゴリ'] = category
    category_frames.append(tmp[count_columns])

# Concatenate once at the end rather than growing the frame inside the loop.
if category_frames:
    df_count = pd.concat(category_frames)
else:
    df_count = pd.DataFrame(columns=count_columns)
df_count